diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index c7525942f12ce..3960916519557 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -345,7 +345,10 @@ optmem_max ---------- Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence -of struct cmsghdr structures with appended data. +of struct cmsghdr structures with appended data. TCP tx zerocopy also uses +optmem_max as a limit for its internal structures. + +Default : 128 KB fb_tunnels_only_for_init_net ---------------------------- diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a91ef9f8de60b..78214f1b43a20 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,6 +13,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_optmem_max; u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS diff --git a/include/net/sock.h b/include/net/sock.h index 1d6931caf0c3c..8b6fe164b218d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2920,7 +2920,6 @@ extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cca7594be92ec..6c4d90b24d467 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; + int optmem_max; + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/filter.c b/net/core/filter.c index eedb33f3e9982..6d89a9cf33c9f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1219,8 +1219,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); - int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && @@ -1550,12 +1550,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); - int err; + int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1594,7 +1595,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; - int err; + int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1622,7 +1623,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb8bcbff9e83a..72799533426b6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -372,6 +372,10 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; return 0; diff --git a/net/core/sock.c b/net/core/sock.c index fef349dd72fa7..446e945f736b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -283,10 +283,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); -EXPORT_SYMBOL(sysctl_optmem_max); - int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); @@ -2651,7 +2647,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - READ_ONCE(sysctl_optmem_max)) + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2669,7 +2665,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - int optmem_max = READ_ONCE(sysctl_optmem_max); + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 03f1edb948d7d..0f0cb1465e089 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -508,13 +508,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, @@ -673,6 +666,14 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d7d13940774e8..66247e8b429e4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -775,7 +775,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -811,7 +811,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -1254,7 +1254,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; - if (optlen > READ_ONCE(sysctl_optmem_max)) { + if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) { err = -ENOBUFS; break; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 9e8ebda170f14..56c3c467f9deb 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -210,7 +210,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -244,7 +244,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.sh b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh index 9ac4456d48fcc..123439545013d 100755 --- a/tools/testing/selftests/net/io_uring_zerocopy_tx.sh +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh @@ -76,23 +76,22 @@ case "${TXMODE}" in esac # Start of state changes: install cleanup handler -save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})" cleanup() { ip netns del "${NS2}" ip netns del "${NS1}" - sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}" } trap cleanup EXIT -# Configure system settings -sysctl -w -q "${path_sysctl_mem}=1000000" - # Create virtual ethernet pair between network namespaces ip netns add "${NS1}" ip netns add "${NS2}" +# Configure system settings +ip netns exec "${NS1}" sysctl -w -q "${path_sysctl_mem}=1000000" +ip netns exec "${NS2}" sysctl -w -q "${path_sysctl_mem}=1000000" + ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}" diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh index 825ffec85cea3..89c22f5320e0d 100755 --- a/tools/testing/selftests/net/msg_zerocopy.sh +++ b/tools/testing/selftests/net/msg_zerocopy.sh @@ -70,23 +70,22 @@ case "${TXMODE}" in esac # Start of state changes: install cleanup handler -save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})" cleanup() { ip netns del "${NS2}" ip netns del "${NS1}" - sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}" } trap cleanup EXIT -# Configure system settings -sysctl -w -q "${path_sysctl_mem}=1000000" - # Create virtual ethernet pair between network namespaces ip netns add "${NS1}" ip netns add "${NS2}" +# Configure system settings +ip netns exec "${NS1}" sysctl -w -q "${path_sysctl_mem}=1000000" +ip netns exec "${NS2}" sysctl -w -q "${path_sysctl_mem}=1000000" + ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"