From 8919a9b31eb4fb4c0a93e5fb350a626924302aa6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:32 -0400 Subject: [PATCH 1/5] tcp: Only init congestion control if not initialized already Change tcp_init_transfer() to only initialize congestion control if it has not been initialized already. With this new approach, we can arrange things so that if the EBPF code sets the congestion control by calling setsockopt(TCP_CONGESTION) then tcp_init_transfer() will not re-initialize the CC module. This is an approach that has the following beneficial properties: (1) This allows CC module customizations made by the EBPF called in tcp_init_transfer() to persist, and not be wiped out by a later call to tcp_init_congestion_control() in tcp_init_transfer(). (2) Does not flip the order of EBPF and CC init, to avoid causing bugs for existing code upstream that depends on the current order. (3) Does not cause 2 initializations for for CC in the case where the EBPF called in tcp_init_transfer() wants to set the CC to a new CC algorithm. (4) Allows follow-on simplifications to the code in net/core/filter.c and net/ipv4/tcp_cong.c, which currently both have some complexity to special-case CC initialization to avoid double CC initialization if EBPF sets the CC. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 3 ++- net/ipv4/tcp_input.c | 4 +++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c738abeb3265c..dc763ca9413cc 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -96,7 +96,8 @@ struct inet_connection_sock { void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:6, + __u8 icsk_ca_state:5, + icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57a5688755391..7360d3db2b616 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2698,6 +2698,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9cc..d18d7a1ce4ce7 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4337841faeff9..0e5ac0d33fd3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5894,8 +5894,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } From e7b10a4dd1b13c8fb8ee1686da7d01437c5da1d2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:33 -0400 Subject: [PATCH 2/5] tcp: Simplify EBPF TCP_CONGESTION to always init CC Now that the previous patch ensures we don't initialize the congestion control twice, when EBPF sets the congestion control algorithm at connection establishment we can simplify the code by simply initializing the congestion control module at that time. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 47eef9a0be6a7..067f6759a68f7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4313,8 +4313,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { @@ -4449,13 +4447,12 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + true, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -4652,8 +4649,6 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, flags); } From 29a949325c6c90f1431db9af64592275c83d9b2a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:34 -0400 Subject: [PATCH 3/5] tcp: simplify tcp_set_congestion_control(): Always reinitialize Now that the previous patches ensure that all call sites for tcp_set_congestion_control() want to initialize congestion control, we can simplify tcp_set_congestion_control() by removing the reinit argument and the code to support it. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/tcp.h | 2 +- net/core/filter.c | 3 +-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 11 ++--------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index e85d564446c65..f857146c17a52 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 067f6759a68f7..e89d6d7da03c4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4451,8 +4451,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - true, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7360d3db2b616..e58ab9db73ff1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3050,7 +3050,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d18d7a1ce4ce7..a9b0fb52a1ec4 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -341,7 +341,7 @@ int tcp_set_allowed_congestion_control(char *val) * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -365,15 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!ca) { err = -ENOENT; } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } + tcp_reinit_congestion_control(sk, ca); } else { err = -EBUSY; } From 5cdc744caab7cb0d0a3ed479246cbab3dbc59c0e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:35 -0400 Subject: [PATCH 4/5] tcp: simplify _bpf_setsockopt(): Remove flags argument Now that the previous patches have removed the code that uses the flags argument to _bpf_setsockopt(), we can remove that argument. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index e89d6d7da03c4..d266c69419670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4314,7 +4314,7 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { }; static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -4611,9 +4611,7 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ -4647,9 +4645,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { From 5050bef8736facac341fb7e2c0eda5002619acf8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:36 -0400 Subject: [PATCH 5/5] tcp: Simplify tcp_set_congestion_control() load=false case Simplify tcp_set_congestion_control() by removing the initialization code path for the !load case. There are only two call sites for tcp_set_congestion_control(). The EBPF call site is the only one that passes load=false; it also passes cap_net_admin=true. Because of that, the exact same behavior can be achieved by removing the special if (!load) branch of the logic. Both before and after this commit, the EBPF case will call bpf_try_module_get(), and if that succeeds then call tcp_reinit_congestion_control() or if that fails then return EBUSY. Note that this returns the logic to a structure very similar to the structure before: commit 91b5b21c7c16 ("bpf: Add support for changing congestion control") except that the CAP_NET_ADMIN status is passed in as a function argument. This clean-up was suggested by Martin KaFai Lau. Suggested-by: Martin KaFai Lau Signed-off-by: Neal Cardwell Signed-off-by: Alexei Starovoitov Cc: Lawrence Brakmo Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Kevin Yang --- net/ipv4/tcp_cong.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a9b0fb52a1ec4..db47ac24d057e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -362,21 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - if (bpf_try_module_get(ca, ca->owner)) { - tcp_reinit_congestion_control(sk, ca); - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err;