From dcb60f9c403e03133363563ac8ea5d8bba6c2be1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 12 Jul 2023 20:08:32 -0700
Subject: [PATCH 01/10] cpumask: eliminate kernel-doc warnings

Update lib/cpumask.c and <linux/cpumask.h> to fix all kernel-doc
warnings:

include/linux/cpumask.h:185: warning: Function parameter or member 'srcp1' not described in 'cpumask_first_and'
include/linux/cpumask.h:185: warning: Function parameter or member 'srcp2' not described in 'cpumask_first_and'
include/linux/cpumask.h:185: warning: Excess function parameter 'src1p' description in 'cpumask_first_and'
include/linux/cpumask.h:185: warning: Excess function parameter 'src2p' description in 'cpumask_first_and'

lib/cpumask.c:59: warning: Function parameter or member 'node' not described in 'alloc_cpumask_var_node'
lib/cpumask.c:169: warning: Function parameter or member 'src1p' not described in 'cpumask_any_and_distribute'
lib/cpumask.c:169: warning: Function parameter or member 'src2p' not described in 'cpumask_any_and_distribute'

Fixes: 7b4967c53204 ("cpumask: Add alloc_cpumask_var_node()")
Fixes: 839cad5fa54b ("cpumask: fix function description kernel-doc notation")
Fixes: 93ba139ba819 ("cpumask: use find_first_and_bit()")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 8 ++++++--
 lib/cpumask.c           | 5 ++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0d2e2a38b92d0..f10fb87d49dbe 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -175,8 +175,8 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 
 /**
  * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
- * @src1p: the first input
- * @src2p: the second input
+ * @srcp1: the first input
+ * @srcp2: the second input
  *
  * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
  */
@@ -1197,6 +1197,10 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
 /**
  * cpumap_print_list_to_buf  - copies the cpumask into the buffer as
  *	comma-separated list of cpus
+ * @buf: the buffer to copy into
+ * @mask: the cpumask to copy
+ * @off: offset in the string from which we are copying; we copy to @buf
+ * @count: the maximum number of bytes to print
  *
  * Everything is same with the above cpumap_print_bitmask_to_buf()
  * except the print format.
diff --git a/lib/cpumask.c b/lib/cpumask.c
index de356f16773a0..a7fd02b5ae264 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -45,6 +45,7 @@ EXPORT_SYMBOL(cpumask_next_wrap);
  * alloc_cpumask_var_node - allocate a struct cpumask on a given node
  * @mask: pointer to cpumask_var_t where the cpumask is returned
  * @flags: GFP_ flags
+ * @node: memory node from which to allocate or %NUMA_NO_NODE
  *
  * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
  * a nop returning a constant 1 (in <linux/cpumask.h>)
@@ -157,7 +158,9 @@ EXPORT_SYMBOL(cpumask_local_spread);
 static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
 
 /**
- * cpumask_any_and_distribute - Return an arbitrary cpu within srcp1 & srcp2.
+ * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p.
+ * @src1p: first &cpumask for intersection
+ * @src2p: second &cpumask for intersection
  *
  * Iterated calls using the same srcp1 and srcp2 will be distributed within
  * their intersection.

From 2356d198d2b4ddec24efea98271cb3be230bc787 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 17 Jul 2023 12:17:03 -0700
Subject: [PATCH 02/10] lib/bitmap: workaround const_eval test build failure

When building with Clang, and when KASAN and GCOV_PROFILE_ALL are both
enabled, the test fails to build [1]:

>> lib/test_bitmap.c:920:2: error: call to '__compiletime_assert_239' declared with 'error' attribute: BUILD_BUG_ON failed: !__builtin_constant_p(res)
           BUILD_BUG_ON(!__builtin_constant_p(res));
           ^
   include/linux/build_bug.h:50:2: note: expanded from macro 'BUILD_BUG_ON'
           BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
           ^
   include/linux/build_bug.h:39:37: note: expanded from macro 'BUILD_BUG_ON_MSG'
   #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                       ^
   include/linux/compiler_types.h:352:2: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
           ^
   include/linux/compiler_types.h:340:2: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
           ^
   include/linux/compiler_types.h:333:4: note: expanded from macro '__compiletime_assert'
                           prefix ## suffix();                             \
                           ^
   <scratch space>:185:1: note: expanded from here
   __compiletime_assert_239

Originally the failure was attributed to s390, which now appears to be
wrong. The issue is not related to the bitmap code itself, but it does
break the build for the given configuration.

Disabling the const_eval test under that config may potentially hide other
bugs. Instead, work around it by disabling GCOV for test_bitmap until
the compiler gets fixed.

[1] https://github.com/ClangBuiltLinux/linux/issues/1874

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202307171254.yFcH97ej-lkp@intel.com/
Fixes: dc34d5036692 ("lib: test_bitmap: add compile-time optimization/evaluations assertions")
Co-developed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 lib/Makefile      | 6 ++++++
 lib/test_bitmap.c | 8 ++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/Makefile b/lib/Makefile
index 42d307ade225e..1ffae65bb7eed 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -82,7 +82,13 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
 obj-$(CONFIG_TEST_DYNAMIC_DEBUG) += test_dynamic_debug.o
 obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 obj-$(CONFIG_TEST_SCANF) += test_scanf.o
+
 obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
+ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_KASAN),yy)
+# FIXME: Clang breaks test_bitmap_const_eval when KASAN and GCOV are enabled
+GCOV_PROFILE_test_bitmap.o := n
+endif
+
 obj-$(CONFIG_TEST_UUID) += test_uuid.o
 obj-$(CONFIG_TEST_XARRAY) += test_xarray.o
 obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 187f5b2db4cf1..f2ea9f30c7c5d 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -1161,6 +1161,10 @@ static void __init test_bitmap_print_buf(void)
 	}
 }
 
+/*
+ * FIXME: Clang breaks compile-time evaluations when KASAN and GCOV are enabled.
+ * To work around it, GCOV is force-disabled in the Makefile for this configuration.
+ */
 static void __init test_bitmap_const_eval(void)
 {
 	DECLARE_BITMAP(bitmap, BITS_PER_LONG);
@@ -1186,11 +1190,7 @@ static void __init test_bitmap_const_eval(void)
 	 * the compiler is fixed.
 	 */
 	bitmap_clear(bitmap, 0, BITS_PER_LONG);
-#if defined(__s390__) && defined(__clang__)
-	if (!const_test_bit(7, bitmap))
-#else
 	if (!test_bit(7, bitmap))
-#endif
 		bitmap_set(bitmap, 5, 2);
 
 	/* Equals to `unsigned long bitopvar = BIT(20)` */

From 59a4b8fcf31a100514903706e6ffeef2bbe41cbb Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:02 -0700
Subject: [PATCH 03/10] sched: fix sched_numa_find_nth_cpu() in non-NUMA case

When CONFIG_NUMA is enabled, sched_numa_find_nth_cpu() searches for a
CPU in sched_domains_numa_masks. The masks include only online CPUs,
so offline CPUs are effectively skipped.

When CONFIG_NUMA is disabled, the fallback function should be consistent
and skip offline CPUs as well.
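
For illustration, a hedged sketch with hypothetical numbering (CPUs 0-3
set in 'cpus', CPU 2 offline):

	cpumask_nth(2, cpus);                      /* returns 2, an offline CPU */
	cpumask_nth_and(2, cpus, cpu_online_mask); /* returns 3, the next online CPU */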

Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()")
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c77..52f5850730b3e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	return cpumask_nth(cpu, cpus);
+	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
 static inline const struct cpumask *

From fad6da0fbb3f8ac5c7210d35fbf9f1a1ace7cc07 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:03 -0700
Subject: [PATCH 04/10] lib/find: add find_next_and_andnot_bit()

Like find_nth_and_andnot_bit(), find_next_and_andnot_bit() is a
convenient helper that allows traversing bitmaps without storing
intermediate results in a temporary bitmap, as sketched below.
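
A sketch of what the helper folds away (all names here are illustrative,
not from an in-tree user):

	DECLARE_BITMAP(tmp, NBITS);

	/* Before: intermediate result lands in a temporary bitmap */
	bitmap_and(tmp, addr1, addr2, NBITS);
	bitmap_andnot(tmp, tmp, addr3, NBITS);
	bit = find_next_bit(tmp, NBITS, start);

	/* After: a single call, no temporary storage */
	bit = find_next_and_andnot_bit(addr1, addr2, addr3, NBITS, start);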

In the following patches, the function is used to implement NUMA-aware
CPU enumeration.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/find.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 lib/find_bit.c       | 12 ++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72c..90b68d76c0737 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -16,6 +16,9 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l
 					unsigned long nbits, unsigned long start);
 unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
 					unsigned long nbits, unsigned long start);
+unsigned long _find_next_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+					const unsigned long *addr3, unsigned long nbits,
+					unsigned long start);
 unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
 					 unsigned long start);
 extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
@@ -159,6 +162,40 @@ unsigned long find_next_or_bit(const unsigned long *addr1,
 }
 #endif
 
+#ifndef find_next_and_andnot_bit
+/**
+ * find_next_and_andnot_bit - find the next bit set in *addr1 and *addr2,
+ *			      excluding all the bits in *addr3
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @addr3: The third address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Return: the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static __always_inline
+unsigned long find_next_and_andnot_bit(const unsigned long *addr1,
+				   const unsigned long *addr2,
+				   const unsigned long *addr3,
+				   unsigned long size,
+				   unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr1 & *addr2 & ~*addr3 & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_and_andnot_bit(addr1, addr2, addr3, size, offset);
+}
+#endif
+
 #ifndef find_next_zero_bit
 /**
  * find_next_zero_bit - find the next cleared bit in a memory region
@@ -568,6 +605,12 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 	     (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
 	     (bit)++)
 
+#define for_each_and_andnot_bit(bit, addr1, addr2, addr3, size) \
+	for ((bit) = 0;									\
+	     (bit) = find_next_and_andnot_bit((addr1), (addr2), (addr3), (size), (bit)),\
+	     (bit) < (size);								\
+	     (bit)++)
+
 #define for_each_or_bit(bit, addr1, addr2, size) \
 	for ((bit) = 0;									\
 	     (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e6..4403e00890b13 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -182,6 +182,18 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l
 EXPORT_SYMBOL(_find_next_andnot_bit);
 #endif
 
+#ifndef find_next_and_andnot_bit
+unsigned long _find_next_and_andnot_bit(const unsigned long *addr1,
+					const unsigned long *addr2,
+					const unsigned long *addr3,
+					unsigned long nbits,
+					unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_and_andnot_bit);
+#endif
+
 #ifndef find_next_or_bit
 unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
 					unsigned long nbits, unsigned long start)

From 310ae5d9d46b65fdbd18ac1e5bd03681fbc19ae8 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:04 -0700
Subject: [PATCH 05/10] sched/topology: introduce sched_numa_find_next_cpu()

The function searches for the next CPU in a given cpumask according to
the NUMA topology, traversing CPUs hop by hop.

If the CPU is the last CPU in a given hop, sched_numa_find_next_cpu()
switches to the next hop, and picks the first CPU from there, excluding
those already traversed.

Because only online CPUs are present in the NUMA topology masks, offline
CPUs will be skipped even if they are present in the 'cpus' mask provided
in the arguments.
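
A minimal open-coded usage sketch, assuming a 'cpus' mask and a local
'node' are given and do_something() is a placeholder; the RCU read lock
must be held around the walk:

	unsigned int hop = 0;
	int cpu;

	rcu_read_lock();
	for (cpu = sched_numa_find_next_cpu(cpus, 0, node, &hop);
	     cpu < nr_cpu_ids;
	     cpu = sched_numa_find_next_cpu(cpus, cpu + 1, node, &hop))
		do_something(cpu);
	rcu_read_unlock();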

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h | 12 ++++++++++++
 kernel/sched/topology.c  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3e..da92fea385858 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,8 +245,13 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
+/*
+ * The sched_numa_find_*_cpu() family of functions traverses only accessible CPUs,
+ * i.e. those listed in cpu_online_mask.
+ */
 #ifdef CONFIG_NUMA
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsigned int *hop);
 extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
@@ -254,6 +259,13 @@ static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, i
 	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
+static __always_inline
+int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsigned int *hop)
+{
+	return find_next_and_bit(cpumask_bits(cpus), cpumask_bits(cpu_online_mask),
+						small_cpumask_bits, cpu);
+}
+
 static inline const struct cpumask *
 sched_numa_hop_mask(unsigned int node, unsigned int hops)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4f..3b5751a3445de 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2135,6 +2135,45 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 }
 EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
 
+/**
+ * sched_numa_find_next_cpu() - given the NUMA topology, find the next cpu
+ * @cpus: cpumask to find a CPU from
+ * @cpu: current CPU
+ * @node: local node
+ * @hop: (in/out) indicates distance order of current CPU to a local node
+ *
+ * The function searches for the next CPU at a given NUMA distance, indicated
+ * by @hop, and if nothing is found, tries to find CPUs at a greater distance,
+ * starting from the beginning.
+ *
+ * Return: cpu, or >= nr_cpu_ids when nothing is found.
+ */
+int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsigned int *hop)
+{
+	unsigned long *cur, *prev;
+	struct cpumask ***masks;
+	unsigned int ret;
+
+	if (*hop >= sched_domains_numa_levels)
+		return nr_cpu_ids;
+
+	masks = rcu_dereference(sched_domains_numa_masks);
+	cur = cpumask_bits(masks[*hop][node]);
+	if (*hop == 0)
+		ret = find_next_and_bit(cpumask_bits(cpus), cur, nr_cpu_ids, cpu);
+	else {
+		prev = cpumask_bits(masks[*hop - 1][node]);
+		ret = find_next_and_andnot_bit(cpumask_bits(cpus), cur, prev, nr_cpu_ids, cpu);
+	}
+
+	if (ret < nr_cpu_ids)
+		return ret;
+
+	*hop += 1;
+	return sched_numa_find_next_cpu(cpus, 0, node, hop);
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_next_cpu);
+
 /**
  * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
  *                         @node

From 6cc0359cb665b07f6b89ca258a5942e0806e5e2f Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:05 -0700
Subject: [PATCH 06/10] sched/topology: add for_each_numa_{,online}_cpu() macros

for_each_cpu() is widely used in the kernel, and it's beneficial to
create a NUMA-aware version of the macro to improve node locality.

The recently added for_each_numa_hop_mask() works, but switching the
existing codebase to it is not an easy process.

The new for_each_numa_cpu() is designed to be similar to for_each_cpu().
It allows converting existing code to be NUMA-aware simply by adding a hop
iterator variable and passing it to the new macro; for_each_numa_cpu()
takes care of the rest.

At the moment, we have two users of NUMA-aware enumerators. One is
Mellanox's in-tree driver, and the other is Intel's in-review driver:

https://lore.kernel.org/lkml/20230216145455.661709-1-pawel.chmielewski@intel.com/

Both real-life examples follow the same pattern:

	for_each_numa_hop_mask(cpus, prev, node) {
 		for_each_cpu_andnot(cpu, cpus, prev) {
 			if (cnt++ == max_num)
 				goto out;
 			do_something(cpu);
 		}
		prev = cpus;
 	}

With the new macro, it would look like this:

	for_each_numa_online_cpu(cpu, hop, node) {
		if (cnt++ == max_num)
			break;
		do_something(cpu);
 	}

Straight conversion of the existing for_each_cpu() codebase to a NUMA-aware
version with for_each_numa_hop_mask() is difficult because the latter
doesn't take a user-provided cpumask, and eventually ends up with an
open-coded double loop. With for_each_numa_cpu() it shouldn't be a
brainteaser. Consider this NUMA-ignorant example:

	cpumask_t cpus = get_mask();
	int cnt = 0, cpu;

	for_each_cpu(cpu, &cpus) {
		if (cnt++ == max_num)
			break;
		do_something(cpu);
 	}

Converting it to the NUMA-aware version is as simple as:

	cpumask_t cpus = get_mask();
	int node = get_node();
	int cnt = 0, hop, cpu;

	rcu_read_lock();
	for_each_numa_cpu(cpu, hop, node, &cpus) {
		if (cnt++ == max_num)
			break;
		do_something(cpu);
 	}
	rcu_read_unlock();

The latter is slightly more verbose, but it avoids open-coding that annoying
double loop. Another advantage is that it works with a 'hop' parameter that
has the clear meaning of NUMA distance, and doesn't force people unfamiliar
with the enumerator internals to bother with the current/previous masks
machinery.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index da92fea385858..7d878f5f35cfb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -291,4 +291,28 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
 	     !IS_ERR_OR_NULL(mask);					       \
 	     __hops++)
 
+/**
+ * for_each_numa_cpu - iterate over cpus in increasing order taking into account
+ *		       NUMA distances from a given node.
+ * @cpu: the (optionally unsigned) integer iterator
+ * @hop: the iterator variable for nodes, i.e. proximity order to the @node
+ * @node: the NUMA node to start the search from.
+ * @mask: the cpumask pointer
+ *
+ * When considered as a replacement for for_each_cpu(), the following should be
+ * taken into consideration:
+ *  - Only accessible (i.e. online) CPUs are enumerated.
+ *  - CPU enumeration may not be a monotonically increasing sequence.
+ *
+ * rcu_read_lock() must be held.
+ */
+#define for_each_numa_cpu(cpu, hop, node, mask)					\
+	for ((cpu) = 0, (hop) = 0;						\
+		(cpu) = sched_numa_find_next_cpu((mask), (cpu), (node), &(hop)),\
+		(cpu) < nr_cpu_ids;						\
+		(cpu)++)
+
+#define for_each_numa_online_cpu(cpu, hop, node)				\
+	for_each_numa_cpu(cpu, hop, node, cpu_online_mask)
+
 #endif /* _LINUX_TOPOLOGY_H */

From a4be5fa84bb269886310f563e9095e8164f82c8c Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:06 -0700
Subject: [PATCH 07/10] net: mlx5: switch comp_irqs_request() to using
 for_each_numa_cpu

for_each_numa_online_cpu() is a more straightforward alternative to
for_each_numa_hop_mask() + for_each_cpu_andnot().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 3db4866d7880f..7c8dc0443d6a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -813,12 +813,11 @@ static void comp_irqs_release_pci(struct mlx5_core_dev *dev)
 static int comp_irqs_request_pci(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
-	const struct cpumask *prev = cpu_none_mask;
-	const struct cpumask *mask;
 	int ncomp_eqs;
 	u16 *cpus;
 	int ret;
-	int cpu;
+	int cpu;
+	unsigned int hop;
 	int i;
 
 	ncomp_eqs = table->num_comp_eqs;
@@ -828,15 +827,11 @@ static int comp_irqs_request_pci(struct mlx5_core_dev *dev)
 
 	i = 0;
 	rcu_read_lock();
-	for_each_numa_hop_mask(mask, dev->priv.numa_node) {
-		for_each_cpu_andnot(cpu, mask, prev) {
-			cpus[i] = cpu;
-			if (++i == ncomp_eqs)
-				goto spread_done;
-		}
-		prev = mask;
+	for_each_numa_online_cpu(cpu, hop, dev->priv.numa_node) {
+		cpus[i] = cpu;
+		if (++i == ncomp_eqs)
+			break;
 	}
-spread_done:
 	rcu_read_unlock();
 	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs, &table->rmap);
 	kfree(cpus);

From 57ddcc014bbfb9084f068e67dde84cf73c383068 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:07 -0700
Subject: [PATCH 08/10] lib/cpumask: update comment to cpumask_local_spread()

Now that we have for_each_numa_online_cpu(), which is a more
straightforward replacement for cpumask_local_spread() when it comes to
enumerating CPUs with respect to NUMA topology, it's worth updating the
comment.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 lib/cpumask.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index a7fd02b5ae264..357b68be526b1 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -128,11 +128,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  *
  * There's a better alternative based on for_each()-like iterators:
  *
- *	for_each_numa_hop_mask(mask, node) {
- *		for_each_cpu_andnot(cpu, mask, prev)
- *			do_something(cpu);
- *		prev = mask;
- *	}
+ *	for_each_numa_online_cpu(cpu, hop, node)
+ *		do_something(cpu);
  *
  * It's simpler and more verbose than above. Complexity of iterator-based
  * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while

From b9833b80d87030b0def7aeda88471ac7f6acd3cb Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:08 -0700
Subject: [PATCH 09/10] sched: drop for_each_numa_hop_mask()

Now that we have for_each_numa_cpu(), for_each_numa_hop_mask() and all
related code are dead. Drop them.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h | 25 -------------------------
 kernel/sched/topology.c  | 32 --------------------------------
 2 files changed, 57 deletions(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7d878f5f35cfb..c5a9626521663 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -252,7 +252,6 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
 #ifdef CONFIG_NUMA
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
 int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsigned int *hop);
-extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
@@ -265,32 +264,8 @@ int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsi
 	return find_next_and_bit(cpumask_bits(cpus), cpumask_bits(cpu_online_mask),
 						small_cpumask_bits, cpu);
 }
-
-static inline const struct cpumask *
-sched_numa_hop_mask(unsigned int node, unsigned int hops)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
 #endif	/* CONFIG_NUMA */
 
-/**
- * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
- *                          from a given node.
- * @mask: the iteration variable.
- * @node: the NUMA node to start the search from.
- *
- * Requires rcu_lock to be held.
- *
- * Yields cpu_online_mask for @node == NUMA_NO_NODE.
- */
-#define for_each_numa_hop_mask(mask, node)				       \
-	for (unsigned int __hops = 0;					       \
-	     mask = (node != NUMA_NO_NODE || __hops) ?			       \
-		     sched_numa_hop_mask(node, __hops) :		       \
-		     cpu_online_mask,					       \
-	     !IS_ERR_OR_NULL(mask);					       \
-	     __hops++)
-
 /**
  * for_each_numa_cpu - iterate over cpus in increasing order taking into account
  *		       NUMA distances from a given node.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3b5751a3445de..e1b8d0d4f4990 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2174,38 +2174,6 @@ int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsi
 }
 EXPORT_SYMBOL_GPL(sched_numa_find_next_cpu);
 
-/**
- * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
- *                         @node
- * @node: The node to count hops from.
- * @hops: Include CPUs up to that many hops away. 0 means local node.
- *
- * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
- * @node, an error value otherwise.
- *
- * Requires rcu_lock to be held. Returned cpumask is only valid within that
- * read-side section, copy it if required beyond that.
- *
- * Note that not all hops are equal in distance; see sched_init_numa() for how
- * distances and masks are handled.
- * Also note that this is a reflection of sched_domains_numa_masks, which may change
- * during the lifetime of the system (offline nodes are taken out of the masks).
- */
-const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
-{
-	struct cpumask ***masks;
-
-	if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
-		return ERR_PTR(-EINVAL);
-
-	masks = rcu_dereference(sched_domains_numa_masks);
-	if (!masks)
-		return ERR_PTR(-EBUSY);
-
-	return masks[hops][node];
-}
-EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
-
 #endif /* CONFIG_NUMA */
 
 static int __sdt_alloc(const struct cpumask *cpu_map)

From 94b1547668965e1fde8bde3638845ab582b40034 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 30 Apr 2023 10:18:09 -0700
Subject: [PATCH 10/10] lib: test for_each_numa_cpu()

Test for_each_numa_cpu() output to ensure that:
 - all CPUs are picked from NUMA nodes with non-decreasing distances to the
   original node;
 - only online CPUs are enumerated;
 - the macro enumerates each online CPU only once;
 - enumeration order is consistent with cpumask_local_spread().

The latter is an implementation-defined behavior. If cpumask_local_spread()
or for_each_numa_cpu() is changed in the future, the subtest may need to
be adjusted or even removed, as appropriate.

It's useful now because some architectures don't implement node_distance(),
and the generic implementation only distinguishes local and remote nodes,
which doesn't allow testing for_each_numa_cpu() properly.
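
For reference, the generic fallback is essentially (paraphrased from
include/linux/topology.h):

	#define node_distance(from, to) \
		((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)

so on such architectures every remote node looks equally distant.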

Suggested-by: Valentin Schneider <vschneid@redhat.com> (for node_distance() test)
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 lib/test_bitmap.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index f2ea9f30c7c5d..08aed2e3bb238 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -12,6 +12,7 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/topology.h>
 #include <linux/uaccess.h>
 
 #include "../tools/testing/selftests/kselftest_module.h"
@@ -71,6 +72,16 @@ __check_eq_uint(const char *srcfile, unsigned int line,
 	return true;
 }
 
+static bool __init
+__check_ge_uint(const char *srcfile, unsigned int line,
+		const unsigned int exp_uint, unsigned int x)
+{
+	if (exp_uint >= x)
+		return true;
+
+	pr_err("[%s:%u] expected >= %u, got %u\n", srcfile, line, exp_uint, x);
+	return false;
+}
 
 static bool __init
 __check_eq_bitmap(const char *srcfile, unsigned int line,
@@ -86,6 +97,18 @@ __check_eq_bitmap(const char *srcfile, unsigned int line,
 	return true;
 }
 
+static bool __init
+__check_eq_cpumask(const char *srcfile, unsigned int line,
+		  const struct cpumask *exp_cpumask, const struct cpumask *cpumask)
+{
+	if (cpumask_equal(exp_cpumask, cpumask))
+		return true;
+
+	pr_warn("[%s:%u] cpumasks contents differ: expected \"%*pbl\", got \"%*pbl\"\n",
+		srcfile, line, cpumask_pr_args(exp_cpumask), cpumask_pr_args(cpumask));
+	return false;
+}
+
 static bool __init
 __check_eq_pbl(const char *srcfile, unsigned int line,
 	       const char *expected_pbl,
@@ -173,11 +196,11 @@ __check_eq_str(const char *srcfile, unsigned int line,
 	return eq;
 }
 
-#define __expect_eq(suffix, ...)					\
+#define __expect(suffix, ...)						\
 	({								\
 		int result = 0;						\
 		total_tests++;						\
-		if (!__check_eq_ ## suffix(__FILE__, __LINE__,		\
+		if (!__check_ ## suffix(__FILE__, __LINE__,		\
 					   ##__VA_ARGS__)) {		\
 			failed_tests++;					\
 			result = 1;					\
@@ -185,13 +208,19 @@ __check_eq_str(const char *srcfile, unsigned int line,
 		result;							\
 	})
 
+#define __expect_eq(suffix, ...)	__expect(eq_ ## suffix, ##__VA_ARGS__)
+#define __expect_ge(suffix, ...)	__expect(ge_ ## suffix, ##__VA_ARGS__)
+
 #define expect_eq_uint(...)		__expect_eq(uint, ##__VA_ARGS__)
 #define expect_eq_bitmap(...)		__expect_eq(bitmap, ##__VA_ARGS__)
+#define expect_eq_cpumask(...)		__expect_eq(cpumask, ##__VA_ARGS__)
 #define expect_eq_pbl(...)		__expect_eq(pbl, ##__VA_ARGS__)
 #define expect_eq_u32_array(...)	__expect_eq(u32_array, ##__VA_ARGS__)
 #define expect_eq_clump8(...)		__expect_eq(clump8, ##__VA_ARGS__)
 #define expect_eq_str(...)		__expect_eq(str, ##__VA_ARGS__)
 
+#define expect_ge_uint(...)		__expect_ge(uint, ##__VA_ARGS__)
+
 static void __init test_zero_clear(void)
 {
 	DECLARE_BITMAP(bmap, 1024);
@@ -763,6 +792,42 @@ static void __init test_for_each_set_bit_wrap(void)
 	}
 }
 
+static void __init test_for_each_numa_cpu(void)
+{
+	unsigned int node, cpu, hop;
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+		pr_err("Can't allocate cpumask. Skipping for_each_numa_cpu() test");
+		return;
+	}
+
+	for_each_node(node) {
+		unsigned int c = 0, dist, old_dist = node_distance(node, node);
+
+		cpumask_clear(mask);
+
+		rcu_read_lock();
+		for_each_numa_cpu(cpu, hop, node, cpu_possible_mask) {
+			dist = node_distance(cpu_to_node(cpu), node);
+
+			/* Distance between nodes must never decrease */
+			expect_ge_uint(dist, old_dist);
+
+			/* Test for coherence with cpumask_local_spread() */
+			expect_eq_uint(cpumask_local_spread(c++, node), cpu);
+
+			cpumask_set_cpu(cpu, mask);
+			old_dist = dist;
+		}
+		rcu_read_unlock();
+
+		/* Each online CPU must be visited exactly once */
+		expect_eq_uint(c, num_online_cpus());
+		expect_eq_cpumask(mask, cpu_online_mask);
+	}
+}
+
 static void __init test_for_each_set_bit(void)
 {
 	DECLARE_BITMAP(orig, 500);
@@ -1249,6 +1314,7 @@ static void __init selftest(void)
 	test_for_each_clear_bitrange_from();
 	test_for_each_set_clump8();
 	test_for_each_set_bit_wrap();
+	test_for_each_numa_cpu();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);