diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 6394f5dc2303d..466c560b0c304 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -215,6 +215,12 @@ rmem_max The maximum receive socket buffer size in bytes. +rps_default_mask +---------------- + +The default RPS CPU mask used on newly created network devices. An empty +mask means RPS disabled by default. + tstamp_allow_data ----------------- Allow processes to receive tx timestamps looped together with the original diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d5ef4c1fedd24..38ab96ae0d68b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -223,6 +223,7 @@ struct net_device_core_stats { #include extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; +extern struct cpumask rps_default_mask; #endif struct neighbour; diff --git a/net/core/dev.h b/net/core/dev.h index a065b7571441d..e075e198092cc 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -9,6 +9,7 @@ struct net_device; struct netdev_bpf; struct netdev_phys_item_id; struct netlink_ext_ack; +struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ @@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +int rps_cpumask_housekeeping(struct cpumask *mask); #endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca55dd747d6cc..4b361ac6a252a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t store_rps_map(struct netdev_rx_queue *queue, - const char *buf, size_t len) +static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, + cpumask_var_t mask) { - struct rps_map *old_map, *map; - cpumask_var_t mask; - int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); - if (err) { - free_cpumask_var(mask); - return err; - } - - if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; - } - } + struct rps_map *old_map, *map; + int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); - if (!map) { - free_cpumask_var(mask); + if (!map) return -ENOMEM; - } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) @@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (old_map) kfree_rcu(old_map, rcu); + return 0; +} +int rps_cpumask_housekeeping(struct cpumask *mask) +{ + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) + return -EINVAL; + } + return 0; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + cpumask_var_t mask; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) + goto out; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto out; + + err = netdev_rx_queue_set_rps_mask(queue, mask); + +out: free_cpumask_var(mask); - return len; + return err ? : len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, @@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) goto err; } +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + if (!cpumask_empty(&rps_default_mask)) { + error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask); + if (error) + goto err; + } +#endif kobject_uevent(kobj, KOBJ_ADD); return error; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index e7b98162c6321..7130e6d9e263a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char kbuf[128]; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return; + } + + len = min(sizeof(kbuf) - 1, *lenp); + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + return; + } + + if (len < *lenp) + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; +} +#endif + #ifdef CONFIG_RPS +struct cpumask rps_default_mask; + +static int rps_default_mask_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + + rtnl_lock(); + if (write) { + err = cpumask_parse(buffer, &rps_default_mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(&rps_default_mask); + if (err) + goto done; + } else { + dump_cpumask(buffer, lenp, ppos, &rps_default_mask); + } + +done: + rtnl_unlock(); + return err; +} + static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -171,17 +217,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, + { + .procname = "rps_default_mask", + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { @@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = { static __init int sysctl_core_init(void) { +#if IS_ENABLED(CONFIG_RPS) + cpumask_copy(&rps_default_mask, cpu_none_mask); +#endif + register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 951bd5342bc64..3364c548a23bf 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh TEST_PROGS += l2_tos_ttl_inherit.sh TEST_PROGS += bind_bhash.sh TEST_PROGS += ip_local_port_range.sh +TEST_PROGS += rps_default_mask.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index bd89198cd8176..cc9fd55ab8699 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -3,6 +3,9 @@ CONFIG_NET_NS=y CONFIG_BPF_SYSCALL=y CONFIG_TEST_BPF=m CONFIG_NUMA=y +CONFIG_RPS=y +CONFIG_SYSFS=y +CONFIG_PROC_SYSCTL=y CONFIG_NET_VRF=y CONFIG_NET_L3_MASTER_DEV=y CONFIG_IPV6=y diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh new file mode 100755 index 0000000000000..c81c0ac7ddfe9 --- /dev/null +++ b/tools/testing/selftests/net/rps_default_mask.sh @@ -0,0 +1,57 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly ksft_skip=4 +readonly cpus=$(nproc) +ret=0 + +[ $cpus -gt 2 ] || exit $ksft_skip + +readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask) +readonly NETNS="ns-$(mktemp -u XXXXXX)" + +setup() { + ip netns add "${NETNS}" + ip -netns "${NETNS}" link set lo up +} + +cleanup() { + echo $INITIAL_RPS_DEFAULT_MASK > /proc/sys/net/core/rps_default_mask + ip netns del $NETNS +} + +chk_rps() { + local rps_mask expected_rps_mask=$3 + local dev_name=$2 + local msg=$1 + + rps_mask=$(ip netns exec $NETNS cat /sys/class/net/$dev_name/queues/rx-0/rps_cpus) + printf "%-60s" "$msg" + if [ $rps_mask -eq $expected_rps_mask ]; then + echo "[ ok ]" + else + echo "[fail] expected $expected_rps_mask found $rps_mask" + ret=1 + fi +} + +trap cleanup EXIT + +echo 0 > /proc/sys/net/core/rps_default_mask +setup +chk_rps "empty rps_default_mask" lo 0 +cleanup + +echo 1 > /proc/sys/net/core/rps_default_mask +setup +chk_rps "non zero rps_default_mask" lo 1 + +echo 3 > /proc/sys/net/core/rps_default_mask +chk_rps "changing rps_default_mask dont affect existing netns" lo 1 + +ip -n $NETNS link add type veth +ip -n $NETNS link set dev veth0 up +ip -n $NETNS link set dev veth1 up +chk_rps "changing rps_default_mask affect newly created devices" veth0 3 +chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3 +exit $ret