From 135746c61fa6d7f66dc079027304eaa4d35fe942 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 19:44:55 +0100 Subject: [PATCH 1/4] net-sysctl: factor out cpumask parsing helper Will be used by the following patch to avoid code duplication. No functional changes intended. The only difference is that now flow_limit_cpu_sysctl() will always compute the flow limit mask on each read operation, even when read() will not return any byte to user-space. Note that the new helper is placed under a new #ifdef at the file start to better fit the usage in the later patch Signed-off-by: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/sysctl_net_core.c | 46 +++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index e7b98162c6321..6935ecdc84b0d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,6 +45,33 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) +static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char kbuf[128]; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return; + } + + len = min(sizeof(kbuf) - 1, *lenp); + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + return; + } + + if (len < *lenp) + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; +} +#endif + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -155,13 +182,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -171,17 +191,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + dump_cpumask(buffer, lenp, ppos, mask); } done: From 370ca718fd5e1fd45ccfdf7a9d76d010f561e607 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 19:44:56 +0100 Subject: [PATCH 2/4] net-sysctl: factor-out rpm mask manipulation helpers Will simplify the following patch. No functional change intended. Signed-off-by: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/dev.h | 2 ++ net/core/net-sysfs.c | 72 ++++++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/net/core/dev.h b/net/core/dev.h index a065b7571441d..e075e198092cc 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -9,6 +9,7 @@ struct net_device; struct netdev_bpf; struct netdev_phys_item_id; struct netlink_ext_ack; +struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ @@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +int rps_cpumask_housekeeping(struct cpumask *mask); #endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca55dd747d6cc..2126970a4bfde 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t store_rps_map(struct netdev_rx_queue *queue, - const char *buf, size_t len) +static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, + cpumask_var_t mask) { - struct rps_map *old_map, *map; - cpumask_var_t mask; - int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); - if (err) { - free_cpumask_var(mask); - return err; - } - - if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; - } - } + struct rps_map *old_map, *map; + int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); - if (!map) { - free_cpumask_var(mask); + if (!map) return -ENOMEM; - } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) @@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (old_map) kfree_rcu(old_map, rcu); + return 0; +} +int rps_cpumask_housekeeping(struct cpumask *mask) +{ + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) + return -EINVAL; + } + return 0; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + cpumask_var_t mask; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) + goto out; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto out; + + err = netdev_rx_queue_set_rps_mask(queue, mask); + +out: free_cpumask_var(mask); - return len; + return err ? : len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, From 605cfa1b1090b5d9e227d8a8f7d08fdd04f07724 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 19:44:57 +0100 Subject: [PATCH 3/4] net: introduce default_rps_mask netns attribute If RPS is enabled, this allows configuring a default rps mask, which is effective since receive queue creation time. A default RPS mask allows the system admin to ensure proper isolation, avoiding races at network namespace or device creation time. The default RPS mask is initially empty, and can be modified via a newly added sysctl entry. Signed-off-by: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/sysctl/net.rst | 6 ++++ include/linux/netdevice.h | 1 + net/core/net-sysfs.c | 7 +++++ net/core/sysctl_net_core.c | 37 +++++++++++++++++++++++- 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 6394f5dc2303d..466c560b0c304 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -215,6 +215,12 @@ rmem_max The maximum receive socket buffer size in bytes. +rps_default_mask +---------------- + +The default RPS CPU mask used on newly created network devices. An empty +mask means RPS disabled by default. + tstamp_allow_data ----------------- Allow processes to receive tx timestamps looped together with the original diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d5ef4c1fedd24..38ab96ae0d68b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -223,6 +223,7 @@ struct net_device_core_stats { #include extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; +extern struct cpumask rps_default_mask; #endif struct neighbour; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 2126970a4bfde..4b361ac6a252a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1083,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) goto err; } +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + if (!cpumask_empty(&rps_default_mask)) { + error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask); + if (error) + goto err; + } +#endif kobject_uevent(kobj, KOBJ_ADD); return error; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6935ecdc84b0d..7130e6d9e263a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,7 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); -#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, struct cpumask *mask) { @@ -73,6 +74,31 @@ static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, #endif #ifdef CONFIG_RPS +struct cpumask rps_default_mask; + +static int rps_default_mask_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + + rtnl_lock(); + if (write) { + err = cpumask_parse(buffer, &rps_default_mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(&rps_default_mask); + if (err) + goto done; + } else { + dump_cpumask(buffer, lenp, ppos, &rps_default_mask); + } + +done: + rtnl_unlock(); + return err; +} + static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -482,6 +508,11 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, + { + .procname = "rps_default_mask", + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { @@ -685,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = { static __init int sysctl_core_init(void) { +#if IS_ENABLED(CONFIG_RPS) + cpumask_copy(&rps_default_mask, cpu_none_mask); +#endif + register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } From c12e0d5f267d7eb45a2f8eaa9fd44eaa2871a95e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 19:44:58 +0100 Subject: [PATCH 4/4] self-tests: introduce self-tests for RPS default mask Ensure that RPS default mask changes take place on all newly created netns/devices and don't affect existing ones. Signed-off-by: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 3 + .../testing/selftests/net/rps_default_mask.sh | 57 +++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100755 tools/testing/selftests/net/rps_default_mask.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 951bd5342bc64..3364c548a23bf 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh TEST_PROGS += l2_tos_ttl_inherit.sh TEST_PROGS += bind_bhash.sh TEST_PROGS += ip_local_port_range.sh +TEST_PROGS += rps_default_mask.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index bd89198cd8176..cc9fd55ab8699 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -3,6 +3,9 @@ CONFIG_NET_NS=y CONFIG_BPF_SYSCALL=y CONFIG_TEST_BPF=m CONFIG_NUMA=y +CONFIG_RPS=y +CONFIG_SYSFS=y +CONFIG_PROC_SYSCTL=y CONFIG_NET_VRF=y CONFIG_NET_L3_MASTER_DEV=y CONFIG_IPV6=y diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh new file mode 100755 index 0000000000000..c81c0ac7ddfe9 --- /dev/null +++ b/tools/testing/selftests/net/rps_default_mask.sh @@ -0,0 +1,57 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly ksft_skip=4 +readonly cpus=$(nproc) +ret=0 + +[ $cpus -gt 2 ] || exit $ksft_skip + +readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask) +readonly NETNS="ns-$(mktemp -u XXXXXX)" + +setup() { + ip netns add "${NETNS}" + ip -netns "${NETNS}" link set lo up +} + +cleanup() { + echo $INITIAL_RPS_DEFAULT_MASK > /proc/sys/net/core/rps_default_mask + ip netns del $NETNS +} + +chk_rps() { + local rps_mask expected_rps_mask=$3 + local dev_name=$2 + local msg=$1 + + rps_mask=$(ip netns exec $NETNS cat /sys/class/net/$dev_name/queues/rx-0/rps_cpus) + printf "%-60s" "$msg" + if [ $rps_mask -eq $expected_rps_mask ]; then + echo "[ ok ]" + else + echo "[fail] expected $expected_rps_mask found $rps_mask" + ret=1 + fi +} + +trap cleanup EXIT + +echo 0 > /proc/sys/net/core/rps_default_mask +setup +chk_rps "empty rps_default_mask" lo 0 +cleanup + +echo 1 > /proc/sys/net/core/rps_default_mask +setup +chk_rps "non zero rps_default_mask" lo 1 + +echo 3 > /proc/sys/net/core/rps_default_mask +chk_rps "changing rps_default_mask dont affect existing netns" lo 1 + +ip -n $NETNS link add type veth +ip -n $NETNS link set dev veth0 up +ip -n $NETNS link set dev veth1 up +chk_rps "changing rps_default_mask affect newly created devices" veth0 3 +chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3 +exit $ret