Skip to content

Commit

Permalink
Merge branch 'net-optmem_max-changes'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
net: optmem_max changes

optmem_max default value is too small for tx zerocopy workloads.

The first patch increases the default from 20 KB to 128 KB,
which is the value we have used for seven years.

The second patch makes the optmem_max sysctl per-netns.

The last patch tweaks two tests accordingly.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Dec 15, 2023
2 parents e16064c + 18872ba commit 9ed816b
Show file tree
Hide file tree
Showing 12 changed files with 41 additions and 36 deletions.
5 changes: 4 additions & 1 deletion Documentation/admin-guide/sysctl/net.rst
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ optmem_max
----------

Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
of struct cmsghdr structures with appended data.
of struct cmsghdr structures with appended data. TCP tx zerocopy also uses
optmem_max as a limit for its internal structures.

Default : 128 KB

fb_tunnels_only_for_init_net
----------------------------
Expand Down
1 change: 1 addition & 0 deletions include/net/netns/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct netns_core {
struct ctl_table_header *sysctl_hdr;

int sysctl_somaxconn;
int sysctl_optmem_max;
u8 sysctl_txrehash;

#ifdef CONFIG_PROC_FS
Expand Down
1 change: 0 additions & 1 deletion include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -2920,7 +2920,6 @@ extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern int sysctl_tstamp_allow_data;
extern int sysctl_optmem_max;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
Expand Down
3 changes: 2 additions & 1 deletion net/core/bpf_sk_storage.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
void *owner, u32 size)
{
int optmem_max = READ_ONCE(sysctl_optmem_max);
struct sock *sk = (struct sock *)owner;
int optmem_max;

optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
/* same check as in sock_kmalloc() */
if (size <= optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
Expand Down
12 changes: 7 additions & 5 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -1219,8 +1219,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
*/
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
u32 filter_size = bpf_prog_size(fp->prog->len);
int optmem_max = READ_ONCE(sysctl_optmem_max);

/* same check as in sock_kmalloc() */
if (filter_size <= optmem_max &&
Expand Down Expand Up @@ -1550,12 +1550,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
int err, optmem_max;

if (IS_ERR(prog))
return PTR_ERR(prog);

if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
if (bpf_prog_size(prog->len) > optmem_max)
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
Expand Down Expand Up @@ -1594,7 +1595,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog;
int err;
int err, optmem_max;

if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
Expand Down Expand Up @@ -1622,7 +1623,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
if (bpf_prog_size(prog->len) > optmem_max) {
err = -ENOMEM;
goto err_prog_put;
}
Expand Down
4 changes: 4 additions & 0 deletions net/core/net_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
/* Limits per socket sk_omem_alloc usage.
* TCP zerocopy regular usage needs 128 KB.
*/
net->core.sysctl_optmem_max = 128 * 1024;
net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;

return 0;
Expand Down
8 changes: 2 additions & 6 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,6 @@ EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
Expand Down Expand Up @@ -2651,7 +2647,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
READ_ONCE(sysctl_optmem_max))
READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return NULL;

skb = alloc_skb(size, priority);
Expand All @@ -2669,7 +2665,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
int optmem_max = READ_ONCE(sysctl_optmem_max);
int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

if ((unsigned int)size <= optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
Expand Down
15 changes: 8 additions & 7 deletions net/core/sysctl_net_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -508,13 +508,6 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "optmem_max",
.data = &sysctl_optmem_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tstamp_allow_data",
.data = &sysctl_tstamp_allow_data,
Expand Down Expand Up @@ -673,6 +666,14 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
{
.procname = "optmem_max",
.data = &init_net.core.sysctl_optmem_max,
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
{
.procname = "txrehash",
.data = &init_net.core.sysctl_txrehash,
Expand Down
6 changes: 3 additions & 3 deletions net/ipv4/ip_sockglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)

if (optlen < GROUP_FILTER_SIZE(0))
return -EINVAL;
if (optlen > READ_ONCE(sysctl_optmem_max))
if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return -ENOBUFS;

gsf = memdup_sockptr(optval, optlen);
Expand Down Expand Up @@ -811,7 +811,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,

if (optlen < size0)
return -EINVAL;
if (optlen > READ_ONCE(sysctl_optmem_max) - 4)
if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
return -ENOBUFS;

p = kmalloc(optlen + 4, GFP_KERNEL);
Expand Down Expand Up @@ -1254,7 +1254,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,

if (optlen < IP_MSFILTER_SIZE(0))
goto e_inval;
if (optlen > READ_ONCE(sysctl_optmem_max)) {
if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) {
err = -ENOBUFS;
break;
}
Expand Down
4 changes: 2 additions & 2 deletions net/ipv6/ipv6_sockglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,

if (optlen < GROUP_FILTER_SIZE(0))
return -EINVAL;
if (optlen > READ_ONCE(sysctl_optmem_max))
if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return -ENOBUFS;

gsf = memdup_sockptr(optval, optlen);
Expand Down Expand Up @@ -244,7 +244,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,

if (optlen < size0)
return -EINVAL;
if (optlen > READ_ONCE(sysctl_optmem_max) - 4)
if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
return -ENOBUFS;

p = kmalloc(optlen + 4, GFP_KERNEL);
Expand Down
9 changes: 4 additions & 5 deletions tools/testing/selftests/net/io_uring_zerocopy_tx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,23 +76,22 @@ case "${TXMODE}" in
esac

# Start of state changes: install cleanup handler
save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"

cleanup() {
ip netns del "${NS2}"
ip netns del "${NS1}"
sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
}

trap cleanup EXIT

# Configure system settings
sysctl -w -q "${path_sysctl_mem}=1000000"

# Create virtual ethernet pair between network namespaces
ip netns add "${NS1}"
ip netns add "${NS2}"

# Configure system settings
ip netns exec "${NS1}" sysctl -w -q "${path_sysctl_mem}=1000000"
ip netns exec "${NS2}" sysctl -w -q "${path_sysctl_mem}=1000000"

ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"

Expand Down
9 changes: 4 additions & 5 deletions tools/testing/selftests/net/msg_zerocopy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,23 +70,22 @@ case "${TXMODE}" in
esac

# Start of state changes: install cleanup handler
save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"

cleanup() {
ip netns del "${NS2}"
ip netns del "${NS1}"
sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
}

trap cleanup EXIT

# Configure system settings
sysctl -w -q "${path_sysctl_mem}=1000000"

# Create virtual ethernet pair between network namespaces
ip netns add "${NS1}"
ip netns add "${NS2}"

# Configure system settings
ip netns exec "${NS1}" sysctl -w -q "${path_sysctl_mem}=1000000"
ip netns exec "${NS2}" sysctl -w -q "${path_sysctl_mem}=1000000"

ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"

Expand Down

0 comments on commit 9ed816b

Please sign in to comment.