Skip to content

Commit

Permalink
tls: Add opt-in zerocopy mode of sendfile()
Browse files Browse the repository at this point in the history
TLS device offload copies sendfile data to a bounce buffer before
transmitting. This keeps the MAC on TLS records valid when the file
contents change and part of a TLS record has to be retransmitted at the
TCP level.

In many common use cases (like serving static files over HTTPS) the file
contents are not changed on the fly. In many use cases breaking the
connection is totally acceptable if the file is changed during
transmission, because it would be received corrupted in any case.

This commit optimizes performance for such use cases by
providing a new optional mode of TLS sendfile(), in which the extra copy
is skipped. Removing this copy improves performance significantly, as
TLS and TCP sendfile perform the same operations, and the only overhead
is TLS header/trailer insertion.

The new mode can only be enabled with the new socket option named
TLS_TX_ZEROCOPY_SENDFILE on per-socket basis. It preserves backwards
compatibility with existing applications that rely on the copying
behavior.

The new mode is safe, meaning that unsolicited modifications of the file
being sent can't break integrity of the kernel. The worst thing that can
happen is sending a corrupted TLS record, which is in any case not
forbidden when using regular TCP sockets.

Sockets other than TLS device offload are not affected by the new socket
option. The actual status of zerocopy sendfile can be queried with
sock_diag.

Performance numbers in a single-core test with 24 HTTPS streams on
nginx, under 100% CPU load:

* non-zerocopy: 33.6 Gbit/s
* zerocopy: 79.92 Gbit/s

CPU: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz

Signed-off-by: Boris Pismenny <borisp@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20220518092731.1243494-1-maximmi@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
  • Loading branch information
Boris Pismenny authored and Paolo Abeni committed May 19, 2022
1 parent e43d940 commit c1318b3
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 13 deletions.
1 change: 1 addition & 0 deletions include/net/tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ struct tls_context {

u8 tx_conf:3;
u8 rx_conf:3;
u8 zerocopy_sendfile:1;

int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
Expand Down
2 changes: 2 additions & 0 deletions include/uapi/linux/tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
/* TLS socket options */
#define TLS_TX 1 /* Set transmit parameters */
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_SENDFILE 3 /* transmit zerocopy sendfile */

/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
Expand Down Expand Up @@ -160,6 +161,7 @@ enum {
TLS_INFO_CIPHER,
TLS_INFO_TXCONF,
TLS_INFO_RXCONF,
TLS_INFO_ZC_SENDFILE,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
Expand Down
53 changes: 40 additions & 13 deletions net/tls/tls_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
return 0;
}

/*
 * Payload-source argument passed by value to tls_push_data().
 *
 * Only one member is meaningful per call, and tls_push_data() picks it
 * based on its zc_page argument:
 *  - msg_iter: read when zc_page is NULL; data is copied from this
 *    iterator into the socket page frag.
 *  - offset:   read when zc_page is non-NULL; used as the starting offset
 *    of the fragment inside zc_page.
 */
union tls_iter_offset {
struct iov_iter *msg_iter;
int offset;
};

static int tls_push_data(struct sock *sk,
struct iov_iter *msg_iter,
union tls_iter_offset iter_offset,
size_t size, int flags,
unsigned char record_type)
unsigned char record_type,
struct page *zc_page)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_prot_info *prot = &tls_ctx->prot_info;
Expand Down Expand Up @@ -480,12 +486,21 @@ static int tls_push_data(struct sock *sk,
}

record = ctx->open_record;
copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
copy = min_t(size_t, copy, (max_open_record_len - record->len));

if (copy) {
copy = min_t(size_t, size, max_open_record_len - record->len);
if (copy && zc_page) {
struct page_frag zc_pfrag;

zc_pfrag.page = zc_page;
zc_pfrag.offset = iter_offset.offset;
zc_pfrag.size = copy;
tls_append_frag(record, &zc_pfrag, copy);
} else if (copy) {
copy = min_t(size_t, copy, pfrag->size - pfrag->offset);

rc = tls_device_copy_data(page_address(pfrag->page) +
pfrag->offset, copy, msg_iter);
pfrag->offset, copy,
iter_offset.msg_iter);
if (rc)
goto handle_error;
tls_append_frag(record, pfrag, copy);
Expand Down Expand Up @@ -540,6 +555,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
unsigned char record_type = TLS_RECORD_TYPE_DATA;
struct tls_context *tls_ctx = tls_get_ctx(sk);
union tls_iter_offset iter;
int rc;

mutex_lock(&tls_ctx->tx_lock);
Expand All @@ -551,8 +567,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto out;
}

rc = tls_push_data(sk, &msg->msg_iter, size,
msg->msg_flags, record_type);
iter.msg_iter = &msg->msg_iter;
rc = tls_push_data(sk, iter, size, msg->msg_flags, record_type, NULL);

out:
release_sock(sk);
Expand All @@ -564,7 +580,8 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct iov_iter msg_iter;
union tls_iter_offset iter_offset;
struct iov_iter msg_iter;
char *kaddr;
struct kvec iov;
int rc;
Expand All @@ -580,12 +597,20 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
goto out;
}

if (tls_ctx->zerocopy_sendfile) {
iter_offset.offset = offset;
rc = tls_push_data(sk, iter_offset, size,
flags, TLS_RECORD_TYPE_DATA, page);
goto out;
}

kaddr = kmap(page);
iov.iov_base = kaddr + offset;
iov.iov_len = size;
iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size);
rc = tls_push_data(sk, &msg_iter, size,
flags, TLS_RECORD_TYPE_DATA);
iter_offset.msg_iter = &msg_iter;
rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA,
NULL);
kunmap(page);

out:
Expand Down Expand Up @@ -656,10 +681,12 @@ EXPORT_SYMBOL(tls_get_record);

/*
 * Calls tls_push_data() with an empty kvec iterator and a size of 0,
 * forwarding the caller's flags with record type TLS_RECORD_TYPE_DATA, and
 * returns whatever tls_push_data() returns.
 *
 * NOTE(review): this span is a rendered diff with its +/- markers stripped,
 * so removed and added lines appear together (duplicate msg_iter
 * declaration, two return statements). The call passing &msg_iter directly
 * looks like the pre-change line and the one passing the union plus a NULL
 * page the post-change line -- confirm against the actual tree.
 */
static int tls_device_push_pending_record(struct sock *sk, int flags)
{
struct iov_iter msg_iter;
union tls_iter_offset iter;
struct iov_iter msg_iter;

iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0);
return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
iter.msg_iter = &msg_iter;
return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL);
}

void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
Expand Down
55 changes: 55 additions & 0 deletions net/tls/tls_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,26 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval,
return rc;
}

/*
 * getsockopt(TLS_TX_ZEROCOPY_SENDFILE) handler.
 *
 * Copies the context's zerocopy_sendfile flag to userspace as an
 * unsigned int. The length supplied through @optlen must be exactly
 * sizeof(unsigned int).
 *
 * Returns 0 on success, -EFAULT if @optlen cannot be read or @optval cannot
 * be written, -EINVAL if the supplied length does not match.
 */
static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval,
				   int __user *optlen)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	unsigned int zc_enabled;
	int user_len;

	if (get_user(user_len, optlen))
		return -EFAULT;

	/* Only an exact-size buffer is accepted; no truncation or padding. */
	if (user_len != sizeof(zc_enabled))
		return -EINVAL;

	/* Widen the 1-bit field into a full word before copying it out. */
	zc_enabled = tls_ctx->zerocopy_sendfile;

	return copy_to_user(optval, &zc_enabled, sizeof(zc_enabled)) ?
		-EFAULT : 0;
}

static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
Expand All @@ -524,6 +544,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
rc = do_tls_getsockopt_conf(sk, optval, optlen,
optname == TLS_TX);
break;
case TLS_TX_ZEROCOPY_SENDFILE:
rc = do_tls_getsockopt_tx_zc(sk, optval, optlen);
break;
default:
rc = -ENOPROTOOPT;
break;
Expand Down Expand Up @@ -675,6 +698,26 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
return rc;
}

/*
 * setsockopt(TLS_TX_ZEROCOPY_SENDFILE) handler.
 *
 * Reads an unsigned int from @optval and stores it in the context's
 * zerocopy_sendfile flag. Only the values 0 and 1 are accepted, and
 * @optlen must be exactly sizeof(unsigned int).
 *
 * Returns 0 on success, -EINVAL for a NULL @optval, a wrong @optlen or an
 * out-of-range value, -EFAULT if the value cannot be copied in.
 */
static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval,
				   unsigned int optlen)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	unsigned int requested;

	if (sockptr_is_null(optval))
		return -EINVAL;

	if (optlen != sizeof(requested))
		return -EINVAL;

	if (copy_from_sockptr(&requested, optval, sizeof(requested)))
		return -EFAULT;

	/* The flag is a single bit: anything other than 0 or 1 is rejected. */
	if (requested != 0 && requested != 1)
		return -EINVAL;

	tls_ctx->zerocopy_sendfile = requested;
	return 0;
}

static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
Expand All @@ -688,6 +731,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
optname == TLS_TX);
release_sock(sk);
break;
case TLS_TX_ZEROCOPY_SENDFILE:
lock_sock(sk);
rc = do_tls_setsockopt_tx_zc(sk, optval, optlen);
release_sock(sk);
break;
default:
rc = -ENOPROTOOPT;
break;
Expand Down Expand Up @@ -921,6 +969,12 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
if (err)
goto nla_failure;

if (ctx->tx_conf == TLS_HW && ctx->zerocopy_sendfile) {
err = nla_put_flag(skb, TLS_INFO_ZC_SENDFILE);
if (err)
goto nla_failure;
}

rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
Expand All @@ -940,6 +994,7 @@ static size_t tls_get_info_size(const struct sock *sk)
nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */
nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_SENDFILE */
0;

return size;
Expand Down

0 comments on commit c1318b3

Please sign in to comment.