From 9ee79a65d176e5815877aa148acc956e4cc90c53 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:29 -0800 Subject: [PATCH 1/8] bpf: sk_msg, fix sk_msg_md access past end test Currently, the test to ensure reads past the end of the sk_msg_md data structure fail is incorrectly expecting success. Fix this typo and use correct expected error. Fixes: 945a47d87cee ("bpf: sk_msg, add tests for size field") Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b246931c46ef7..dbd31750b2141 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1879,7 +1879,7 @@ static struct bpf_test tests[] = { offsetof(struct sk_msg_md, size) + 4), BPF_EXIT_INSN(), }, - .errstr = "R0 !read_ok", + .errstr = "invalid bpf_context access", .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_MSG, }, From bc1b4f013b5029b4c8b63fb9ba8d084119486d7b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:30 -0800 Subject: [PATCH 2/8] bpf: sk_msg, improve offset chk in _is_valid_access The check for max offset in sk_msg_is_valid_access uses sizeof() which is incorrect because it would allow accessing possibly past the end of the struct in the padded case. Further, it doesn't preclude accessing any padding that may be added in the middle of a struct. All told this makes it fragile to rely on. To fix this explicitly check offsets with fields using the bpf_ctx_range() and bpf_ctx_range_till() macros. 
For reference the current structure layout looks as follows (reported by pahole) struct sk_msg_md { union { void * data; /* 8 */ }; /* 0 8 */ union { void * data_end; /* 8 */ }; /* 8 8 */ __u32 family; /* 16 4 */ __u32 remote_ip4; /* 20 4 */ __u32 local_ip4; /* 24 4 */ __u32 remote_ip6[4]; /* 28 16 */ __u32 local_ip6[4]; /* 44 16 */ __u32 remote_port; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 local_port; /* 64 4 */ __u32 size; /* 68 4 */ /* size: 72, cachelines: 2, members: 10 */ /* last cacheline: 8 bytes */ }; So there should be no padding at the moment but fixing this now prevents future errors. Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 3a3b21726fb5f..6bd9f08f6162f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) + break; + default: return false; - + } return true; } From 
7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: [PATCH 3/8] bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 4 +--- net/core/filter.c | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index eb8f6cb84c10b..dd57e6f408b18 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,9 +36,7 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; -/* UAPI in filter.c depends on struct sk_msg_sg being first element. If - * this is moved filter.c also must be updated. - */ +/* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/net/core/filter.c b/net/core/filter.c index 6bd9f08f6162f..447dd1bad31fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7425,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), From 51199405f967207de372d9b60989eb87d7ae8809 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:32 -0800 Subject: [PATCH 4/8] bpf: skb_verdict, support SK_PASS on RX BPF path Add SK_PASS verdict support to SK_SKB_VERDICT programs. Now that support for redirects exists we can implement SK_PASS as a redirect to the same socket. This simplifies the BPF programs and avoids an extra map lookup on RX path for simple visibility cases. 
Further, this reduces user (BPF programmer in this context) confusion when their program drops an skb due to lack of support. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa08..8a91a460de8f6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -669,6 +669,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, bool ingress; switch (verdict) { + case __SK_PASS: + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + goto out_free; + } + if (atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf) { + struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); + + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; + } + goto out_free; case __SK_REDIRECT: sk_other = tcp_skb_bpf_redirect_fetch(skb); if (unlikely(!sk_other)) From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:33 -0800 Subject: [PATCH 5/8] bpf: sk_msg, fix socket data_ready events When a skb verdict program is in-use and either another BPF program redirects to that socket or the new SK_PASS support is used the data_ready callback does not wake up the application. Instead, because the stream parser/verdict is using the sk data_ready callback, we wake up the stream parser/verdict block. Fix this by adding a helper to check if the stream parser block is enabled on the sk and if so call the saved pointer which is the upper layer's wake up function. This fixes application stalls observed when an application is waiting for data in a blocking read(). 
Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 ++++++++ net/core/skmsg.c | 6 +++--- net/ipv4/tcp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dd57e6f408b18..178a3933a71b8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -417,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) sk_psock_drop(sk, psock); } +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) +{ + if (psock->parser.enabled) + psock->parser.saved_data_ready(sk); + else + sk->sk_data_ready(sk); +} + static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8a91a460de8f6..3df7627db4bb5 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -751,7 +751,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. 
*/ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -799,7 +799,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a47c1cdf90fc2..87503343743d3 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -198,7 +198,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, msg->sg.start = i; msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); From a136678c0bdbb650daff5df5eec1dab960e074a7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:34 -0800 Subject: [PATCH 6/8] bpf: sk_msg, zap ingress queue on psock down In addition to releasing any cork'ed data on a psock when the psock is removed we should also release any skb's in the ingress work queue. Otherwise the skb's eventually get free'd but late in the tear down process so we see the WARNING due to non-zero sk_forward_alloc. void sk_stream_kill_queues(struct sock *sk) { ... WARN_ON(sk->sk_forward_alloc); ... 
} Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3df7627db4bb5..86c9726fced80 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); From 0608c69c9a805c6264689d7eab4203eab88cf1da Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:35 -0800 Subject: [PATCH 7/8] bpf: sk_msg, sock{map|hash} redirect through ULP A sockmap program that redirects through a kTLS ULP enabled socket will not work correctly because the ULP layer is skipped. This fixes the behavior to call through the ULP layer on redirect to ensure any operations required on the data stream at the ULP layer continue to be applied. To do this we add an internal flag MSG_SENDPAGE_NOPOLICY to avoid calling the BPF layer on a redirected message. This is required to avoid calling the BPF layer multiple times (possibly recursively) which is not the current/expected behavior without ULPs. In the future we may add a redirect flag if users _do_ want the policy applied again but this would need to work for both ULP and non-ULP sockets and be opt-in to avoid breaking existing programs. Also to avoid polluting the flag space with an internal flag we reuse the flag space overlapping MSG_SENDPAGE_NOPOLICY with MSG_WAITFORONE. Here WAITFORONE is specific to recv path and SENDPAGE_NOPOLICY is only used for sendpage hooks. The last thing to verify is user space API is masked correctly to ensure the flag can not be set by user. (Note this needs to be true regardless because we have internal flags already in-use that user space should not be able to set). 
But for completeness we have two UAPI paths into sendpage: sendfile and splice. In the sendfile case the function do_sendfile() zeroes flags, ./fs/read_write.c: static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { ... fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } In the splice case the pipe_to_sendpage "actor" is used which masks flags with SPLICE_F_MORE. ./fs/splice.c: static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { ... more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; ... } This confirms what we expect: internal flags are in fact internal to the socket side. 
Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + include/net/tls.h | 9 +++++++++ net/ipv4/tcp_bpf.c | 13 ++++++++++++- net/tls/tls_sw.c | 43 +++++++++++++++++++++++++++++------------- 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b571e9b9f76d..84c48a3c0227c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -286,6 +286,7 @@ struct ucred { #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN diff --git a/include/net/tls.h b/include/net/tls.h index bab5627ff5e39..23601f3b02ee2 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -454,6 +454,15 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline bool tls_sw_has_ctx_tx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_tx(ctx); +} + static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 87503343743d3..1bb7321a256d0 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -8,6 +8,7 @@ #include #include +#include static bool tcp_bpf_stream_read(const struct sock *sk) { @@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, u32 off; while (1) { + bool has_tx_ulp; + sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < 
sge->length) ? apply_bytes : sge->length; @@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: - ret = do_tcp_sendpages(sk, page, off, size, flags); + has_tx_ulp = tls_sw_has_ctx_tx(sk); + if (has_tx_ulp) { + flags |= MSG_SENDPAGE_NOPOLICY; + ret = kernel_sendpage_locked(sk, + page, off, size, flags); + } else { + ret = do_tcp_sendpages(sk, page, off, size, flags); + } + if (ret <= 0) return ret; if (apply) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d4ecc66464e6c..5aee9ae5ca535 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -686,12 +686,13 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; + bool enospc, policy; int err = 0, send; u32 delta = 0; - bool enospc; + policy = !(flags & MSG_SENDPAGE_NOPOLICY); psock = sk_psock_get(sk); - if (!psock) + if (!psock || !policy) return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); @@ -1017,8 +1018,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return copied ? 
copied : ret; } -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +int tls_sw_do_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) { long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -1033,15 +1034,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST)) - return -ENOTSUPP; - - /* No MSG_EOR from splice, only look at MSG_MORE */ eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); - - lock_sock(sk); - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Wait till there is any pending write on socket */ @@ -1145,10 +1138,34 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, } sendpage_end: ret = sk_stream_error(sk, flags, ret); - release_sock(sk); return copied ? copied : ret; } +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); +} + +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + int ret; + + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + lock_sock(sk); + ret = tls_sw_do_sendpage(sk, page, offset, size, flags); + release_sock(sk); + return ret; +} + static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, int flags, long timeo, int *err) { From 28cb6f1eaffdc5a6a9707cac55f4a43aa3fd7895 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:36 -0800 Subject: [PATCH 8/8] bpf: tls_sw, init TLS ULP removes BPF proto hooks The existing code did not expect users would initialize the TLS ULP without subsequently calling the TLS TX 
enabling socket option. If the application tries to send data after the TLS ULP enable op but before the TLS TX enable op the BPF sk_msg verdict program is skipped. This patch resolves this by converting the ipv4 sock ops to be calculated at init time the same way ipv6 ops are done. This pulls in any changes to the sock ops structure that have been made after the socket was created including the changes from adding the socket to a sock{map|hash}. This was discovered by running OpenSSL master branch which calls the TLS ULP setsockopt early in TLS handshake but only enables the TLS TX path once the handshake has completed. As a result the datapath missed the initial handshake messages. Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/tls/tls_main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 311cec8e533de..acff12999c06e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -55,6 +55,8 @@ enum { static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); +static struct proto *saved_tcpv4_prot; +static DEFINE_MUTEX(tcpv4_prot_mutex); static LIST_HEAD(device_list); static DEFINE_MUTEX(device_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; @@ -690,6 +692,16 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } + if (ip_ver == TLSV4 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) { + mutex_lock(&tcpv4_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv4_prot)) { + build_protos(tls_prots[TLSV4], sk->sk_prot); + smp_store_release(&saved_tcpv4_prot, sk->sk_prot); + } + mutex_unlock(&tcpv4_prot_mutex); + } + ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); @@ -721,8 +733,6 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init 
tls_register(void) { - build_protos(tls_prots[TLSV4], &tcp_prot); - tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read;