From 0df7cd3c13e44d01f9f28e29cbce74e2931b00fe Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Sat, 16 Sep 2023 16:09:15 +0300 Subject: [PATCH 1/4] vsock/virtio/vhost: read data from non-linear skb This is preparation patch for MSG_ZEROCOPY support. It adds handling of non-linear skbs by replacing direct calls of 'memcpy_to_msg()' with 'skb_copy_datagram_iter()'. Main advantage of the second one is that it can handle paged part of the skb by using 'kmap()' on each page, but if there are no pages in the skb, it behaves like simple copying to iov iterator. This patch also adds new field to the control block of skb - this value shows current offset in the skb to read next portion of data (it doesn't matter linear it or not). Idea behind this field is that 'skb_copy_datagram_iter()' handles both types of skb internally - it just needs an offset from which to copy data from the given skb. This offset is incremented on each read from skb. This approach allows to simplify handling of both linear and non-linear skbs, because for linear skb we need to call 'skb_pull()' after reading data from it, while in non-linear case we need to update 'data_len'. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/vhost/vsock.c | 14 +++++++---- include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 32 +++++++++++++++---------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 817d377a3f360..83711aad855c4 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -114,6 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct sk_buff *skb; unsigned out, in; size_t nbytes; + u32 offset; int head; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); @@ -156,7 +157,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, } iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len); - payload_len = skb->len; + offset = VIRTIO_VSOCK_SKB_CB(skb)->offset; + payload_len = skb->len - offset; hdr = virtio_vsock_hdr(skb); /* If the packet is greater than the space available in the @@ -197,8 +199,10 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, break; } - nbytes = copy_to_iter(skb->data, payload_len, &iov_iter); - if (nbytes != payload_len) { + if (skb_copy_datagram_iter(skb, + offset, + &iov_iter, + payload_len)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt buf\n"); break; @@ -212,13 +216,13 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, vhost_add_used(vq, head, sizeof(*hdr) + payload_len); added = true; - skb_pull(skb, payload_len); + VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len; total_len += payload_len; /* If we didn't send all the payload we can requeue the packet * to send it with the next available buffer. */ - if (skb->len > 0) { + if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) { hdr->flags |= cpu_to_le32(flags_to_restore); /* We are queueing the same skb to handle diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c58453699ee98..a91fbdf233e4f 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -12,6 +12,7 @@ struct virtio_vsock_skb_cb { bool reply; bool tap_delivered; + u32 offset; }; #define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb)) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 352d042b130b5..3e08d52a9355b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -364,9 +364,10 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, spin_unlock_bh(&vvs->rx_lock); /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. + * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ - err = memcpy_to_msg(msg, skb->data, bytes); + err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, + &msg->msg_iter, bytes); if (err) goto out; @@ -410,25 +411,27 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, while (total < len && !skb_queue_empty(&vvs->rx_queue)) { skb = skb_peek(&vvs->rx_queue); - bytes = len - total; - if (bytes > skb->len) - bytes = skb->len; + bytes = min_t(size_t, len - total, + skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset); /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. + * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, skb->data, bytes); + err = skb_copy_datagram_iter(skb, + VIRTIO_VSOCK_SKB_CB(skb)->offset, + &msg->msg_iter, bytes); if (err) goto out; spin_lock_bh(&vvs->rx_lock); total += bytes; - skb_pull(skb, bytes); - if (skb->len == 0) { + VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; + + if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); virtio_transport_dec_rx_pkt(vvs, pkt_len); @@ -492,9 +495,10 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, spin_unlock_bh(&vvs->rx_lock); /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. + * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ - err = memcpy_to_msg(msg, skb->data, bytes); + err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, + &msg->msg_iter, bytes); if (err) return err; @@ -553,11 +557,13 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, int err; /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. + * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, skb->data, bytes_to_copy); + err = skb_copy_datagram_iter(skb, 0, + &msg->msg_iter, + bytes_to_copy); if (err) { /* Copy of message failed. Rest of * fragments will be freed without copy. From 64c99d2d6adac80cb17669736e32bdb331d68193 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Sat, 16 Sep 2023 16:09:16 +0300 Subject: [PATCH 2/4] vsock/virtio: support to send non-linear skb For non-linear skb use its pages from fragment array as buffers in virtio tx queue. These pages are already pinned by 'get_user_pages()' during such skb creation. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport.c | 60 ++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index e95df847176b6..73d7301563491 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -63,6 +63,17 @@ struct virtio_vsock { u32 guest_cid; bool seqpacket_allow; + + /* These fields are used only in tx path in function + * 'virtio_transport_send_pkt_work()', so to save + * stack space in it, place both of them here. Each + * pointer from 'out_sgs' points to the corresponding + * element in 'out_bufs' - this is initialized in + * 'virtio_vsock_probe()'. Both fields are protected + * by 'tx_lock'. +1 is needed for packet header. + */ + struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1]; + struct scatterlist out_bufs[MAX_SKB_FRAGS + 1]; }; static u32 virtio_transport_get_local_cid(void) @@ -100,8 +111,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - struct scatterlist hdr, buf, *sgs[2]; int ret, in_sg = 0, out_sg = 0; + struct scatterlist **sgs; struct sk_buff *skb; bool reply; @@ -111,12 +122,43 @@ virtio_transport_send_pkt_work(struct work_struct *work) virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); - - sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); - sgs[out_sg++] = &hdr; - if (skb->len > 0) { - sg_init_one(&buf, skb->data, skb->len); - sgs[out_sg++] = &buf; + sgs = vsock->out_sgs; + sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), + sizeof(*virtio_vsock_hdr(skb))); + out_sg++; + + if (!skb_is_nonlinear(skb)) { + if (skb->len > 0) { + sg_init_one(sgs[out_sg], skb->data, skb->len); + out_sg++; + } + } else { + struct skb_shared_info *si; + int i; + + /* If skb is nonlinear, then its buffer must contain + * only header and nothing more. Data is stored in + * the fragged part. + */ + WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb))); + + si = skb_shinfo(skb); + + for (i = 0; i < si->nr_frags; i++) { + skb_frag_t *skb_frag = &si->frags[i]; + void *va; + + /* We will use 'page_to_virt()' for the userspace page + * here, because virtio or dma-mapping layers will call + * 'virt_to_phys()' later to fill the buffer descriptor. + * We don't touch memory at "virtual" address of this page. + */ + va = page_to_virt(skb_frag->bv_page); + sg_init_one(sgs[out_sg], + va + skb_frag->bv_offset, + skb_frag->bv_len); + out_sg++; + } } ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); @@ -621,6 +663,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) { struct virtio_vsock *vsock = NULL; int ret; + int i; ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); if (ret) @@ -663,6 +706,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev) if (ret < 0) goto out; + for (i = 0; i < ARRAY_SIZE(vsock->out_sgs); i++) + vsock->out_sgs[i] = &vsock->out_bufs[i]; + rcu_assign_pointer(the_virtio_vsock, vsock); mutex_unlock(&the_virtio_vsock_mutex); From 4b0bf10eb077cb43c09746251ef3608d62c45667 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Sat, 16 Sep 2023 16:09:17 +0300 Subject: [PATCH 3/4] vsock/virtio: non-linear skb handling for tap For tap device new skb is created and data from the current skb is copied to it. This adds copying data from non-linear skb to new the skb. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 31 ++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 3e08d52a9355b..3a48e48a99ac9 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -106,6 +106,27 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, return NULL; } +static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, + void *dst, + size_t len) +{ + struct iov_iter iov_iter = { 0 }; + struct kvec kvec; + size_t to_copy; + + kvec.iov_base = dst; + kvec.iov_len = len; + + iov_iter.iter_type = ITER_KVEC; + iov_iter.kvec = &kvec; + iov_iter.nr_segs = 1; + + to_copy = min_t(size_t, len, skb->len); + + skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, + &iov_iter, to_copy); +} + /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { @@ -114,7 +135,6 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct af_vsockmon_hdr *hdr; struct sk_buff *skb; size_t payload_len; - void *payload_buf; /* A packet could be split to fit the RX buffer, so we can retrieve * the payload length from the header and the buffer pointer taking @@ -122,7 +142,6 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) */ pkt_hdr = virtio_vsock_hdr(pkt); payload_len = pkt->len; - payload_buf = pkt->data; skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); @@ -165,7 +184,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { - skb_put_data(skb, payload_buf, payload_len); + if (skb_is_nonlinear(pkt)) { + void *data = skb_put(skb, payload_len); + + virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); + } else { + skb_put_data(skb, pkt->data, payload_len); + } } return skb; From 581512a6dc939ef122e49336626ae159f3b8a345 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Sat, 16 Sep 2023 16:09:18 +0300 Subject: [PATCH 4/4] vsock/virtio: MSG_ZEROCOPY flag support This adds handling of MSG_ZEROCOPY flag on transmission path: 1) If this flag is set and zerocopy transmission is possible (enabled in socket options and transport allows zerocopy), then non-linear skb will be created and filled with the pages of user's buffer. Pages of user's buffer are locked in memory by 'get_user_pages()'. 2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so to decrease this field correctly, proper skb destructor is needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'. 3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'. If this callback is set, then transport needs extra check to be able to send provided number of buffers in zerocopy mode. Currently, the only transport that needs this callback set is virtio, because this transport adds new buffers to the virtio queue and we need to check, that number of these buffers is less than size of the queue (it is required by virtio spec). vhost and loopback transports don't need this check. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 9 + .../events/vsock_virtio_transport_common.h | 12 +- net/vmw_vsock/virtio_transport.c | 32 +++ net/vmw_vsock/virtio_transport_common.c | 250 ++++++++++++++---- 4 files changed, 241 insertions(+), 62 deletions(-) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index a91fbdf233e4f..ebb3ce63d64da 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -160,6 +160,15 @@ struct virtio_transport { /* Takes ownership of the packet */ int (*send_pkt)(struct sk_buff *skb); + + /* Used in MSG_ZEROCOPY mode. Checks, that provided data + * (number of buffers) could be transmitted with zerocopy + * mode. If this callback is not implemented for the current + * transport - this means that this transport doesn't need + * extra checks and can perform zerocopy transmission by + * default. + */ + bool (*can_msgzerocopy)(int bufs_num); }; ssize_t diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h index d0b3f0ea9ba11..f1ebe36787c3d 100644 --- a/include/trace/events/vsock_virtio_transport_common.h +++ b/include/trace/events/vsock_virtio_transport_common.h @@ -43,7 +43,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt, __u32 len, __u16 type, __u16 op, - __u32 flags + __u32 flags, + bool zcopy ), TP_ARGS( src_cid, src_port, @@ -51,7 +52,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt, len, type, op, - flags + flags, + zcopy ), TP_STRUCT__entry( __field(__u32, src_cid) @@ -62,6 +64,7 @@ TRACE_EVENT(virtio_transport_alloc_pkt, __field(__u16, type) __field(__u16, op) __field(__u32, flags) + __field(bool, zcopy) ), TP_fast_assign( __entry->src_cid = src_cid; @@ -72,14 +75,15 @@ TRACE_EVENT(virtio_transport_alloc_pkt, __entry->type = type; __entry->op = op; __entry->flags = flags; + __entry->zcopy = zcopy; ), - TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x", + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x zcopy=%s", __entry->src_cid, __entry->src_port, __entry->dst_cid, __entry->dst_port, __entry->len, show_type(__entry->type), show_op(__entry->op), - __entry->flags) + __entry->flags, __entry->zcopy ? "true" : "false") ); TRACE_EVENT(virtio_transport_recv_pkt, diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 73d7301563491..09ba3128e7593 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -455,6 +455,37 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +static bool virtio_transport_can_msgzerocopy(int bufs_num) +{ + struct virtio_vsock *vsock; + bool res = false; + + rcu_read_lock(); + + vsock = rcu_dereference(the_virtio_vsock); + if (vsock) { + struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; + + /* Check that tx queue is large enough to keep whole + * data to send. This is needed, because when there is + * not enough free space in the queue, current skb to + * send will be reinserted to the head of tx list of + * the socket to retry transmission later, so if skb + * is bigger than whole queue, it will be reinserted + * again and again, thus blocking other skbs to be sent. + * Each page of the user provided buffer will be added + * as a single buffer to the tx virtqueue, so compare + * number of pages against maximum capacity of the queue. + */ + if (bufs_num <= vq->num_max) + res = true; + } + + rcu_read_unlock(); + + return res; +} + static bool virtio_transport_seqpacket_allow(u32 remote_cid); static struct virtio_transport virtio_transport = { @@ -504,6 +535,7 @@ static struct virtio_transport virtio_transport = { }, .send_pkt = virtio_transport_send_pkt, + .can_msgzerocopy = virtio_transport_can_msgzerocopy, }; static bool virtio_transport_seqpacket_allow(u32 remote_cid) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 3a48e48a99ac9..e22c81435ef7d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -37,73 +37,99 @@ virtio_transport_get_ops(struct vsock_sock *vsk) return container_of(t, struct virtio_transport, transport); } -/* Returns a new packet on success, otherwise returns NULL. - * - * If NULL is returned, errp is set to a negative errno. - */ -static struct sk_buff * -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port) -{ - const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; - struct virtio_vsock_hdr *hdr; - struct sk_buff *skb; - void *payload; - int err; +static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, + struct virtio_vsock_pkt_info *info, + size_t pkt_len) +{ + struct iov_iter *iov_iter; - skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); - if (!skb) - return NULL; + if (!info->msg) + return false; - hdr = virtio_vsock_hdr(skb); - hdr->type = cpu_to_le16(info->type); - hdr->op = cpu_to_le16(info->op); - hdr->src_cid = cpu_to_le64(src_cid); - hdr->dst_cid = cpu_to_le64(dst_cid); - hdr->src_port = cpu_to_le32(src_port); - hdr->dst_port = cpu_to_le32(dst_port); - hdr->flags = cpu_to_le32(info->flags); - hdr->len = cpu_to_le32(len); + iov_iter = &info->msg->msg_iter; - if (info->msg && len > 0) { - payload = skb_put(skb, len); - err = memcpy_from_msg(payload, info->msg, len); - if (err) - goto out; + if (iov_iter->iov_offset) + return false; - if (msg_data_left(info->msg) == 0 && - info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { - hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + /* We can't send whole iov. */ + if (iov_iter->count > pkt_len) + return false; - if (info->msg->msg_flags & MSG_EOR) - hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); - } + /* Check that transport can send data in zerocopy mode. */ + t_ops = virtio_transport_get_ops(info->vsk); + + if (t_ops->can_msgzerocopy) { + int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS); + int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS); + + /* +1 is for packet header. */ + return t_ops->can_msgzerocopy(pages_to_send + 1); } - if (info->reply) - virtio_vsock_skb_set_reply(skb); + return true; +} - trace_virtio_transport_alloc_pkt(src_cid, src_port, - dst_cid, dst_port, - len, - info->type, - info->op, - info->flags); +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, + struct sk_buff *skb, + struct msghdr *msg, + bool zerocopy) +{ + struct ubuf_info *uarg; - if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) { - WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n"); - goto out; + if (msg->msg_ubuf) { + uarg = msg->msg_ubuf; + net_zcopy_get(uarg); + } else { + struct iov_iter *iter = &msg->msg_iter; + struct ubuf_info_msgzc *uarg_zc; + + uarg = msg_zerocopy_realloc(sk_vsock(vsk), + iter->count, + NULL); + if (!uarg) + return -1; + + uarg_zc = uarg_to_msgzc(uarg); + uarg_zc->zerocopy = zerocopy ? 1 : 0; } - return skb; + skb_zcopy_init(skb, uarg); -out: - kfree_skb(skb); - return NULL; + return 0; +} + +static int virtio_transport_fill_skb(struct sk_buff *skb, + struct virtio_vsock_pkt_info *info, + size_t len, + bool zcopy) +{ + if (zcopy) + return __zerocopy_sg_from_iter(info->msg, NULL, skb, + &info->msg->msg_iter, + len); + + return memcpy_from_msg(skb_put(skb, len), info->msg, len); +} + +static void virtio_transport_init_hdr(struct sk_buff *skb, + struct virtio_vsock_pkt_info *info, + size_t payload_len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_hdr *hdr; + + hdr = virtio_vsock_hdr(skb); + hdr->type = cpu_to_le16(info->type); + hdr->op = cpu_to_le16(info->op); + hdr->src_cid = cpu_to_le64(src_cid); + hdr->dst_cid = cpu_to_le64(dst_cid); + hdr->src_port = cpu_to_le32(src_port); + hdr->dst_port = cpu_to_le32(dst_port); + hdr->flags = cpu_to_le32(info->flags); + hdr->len = cpu_to_le32(payload_len); } static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, @@ -214,6 +240,82 @@ static u16 virtio_transport_get_type(struct sock *sk) return VIRTIO_VSOCK_TYPE_SEQPACKET; } +/* Returns new sk_buff on success, otherwise returns NULL. */ +static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, + size_t payload_len, + bool zcopy, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct vsock_sock *vsk; + struct sk_buff *skb; + size_t skb_len; + + skb_len = VIRTIO_VSOCK_SKB_HEADROOM; + + if (!zcopy) + skb_len += payload_len; + + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); + if (!skb) + return NULL; + + virtio_transport_init_hdr(skb, info, payload_len, src_cid, src_port, + dst_cid, dst_port); + + vsk = info->vsk; + + /* If 'vsk' != NULL then payload is always present, so we + * will never call '__zerocopy_sg_from_iter()' below without + * setting skb owner in 'skb_set_owner_w()'. The only case + * when 'vsk' == NULL is VIRTIO_VSOCK_OP_RST control message + * without payload. + */ + WARN_ON_ONCE(!(vsk && (info->msg && payload_len)) && zcopy); + + /* Set owner here, because '__zerocopy_sg_from_iter()' uses + * owner of skb without check to update 'sk_wmem_alloc'. + */ + if (vsk) + skb_set_owner_w(skb, sk_vsock(vsk)); + + if (info->msg && payload_len > 0) { + int err; + + err = virtio_transport_fill_skb(skb, info, payload_len, zcopy); + if (err) + goto out; + + if (msg_data_left(info->msg) == 0 && + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + + if (info->msg->msg_flags & MSG_EOR) + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + } + } + + if (info->reply) + virtio_vsock_skb_set_reply(skb); + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + payload_len, + info->type, + info->op, + info->flags, + zcopy); + + return skb; +out: + kfree_skb(skb); + return NULL; +} + /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * @@ -222,10 +324,12 @@ static u16 virtio_transport_get_type(struct sock *sk) static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt_info *info) { + u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; u32 pkt_len = info->pkt_len; + bool can_zcopy = false; u32 rest_len; int ret; @@ -254,15 +358,30 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) return pkt_len; + if (info->msg) { + /* If zerocopy is not enabled by 'setsockopt()', we behave as + * there is no MSG_ZEROCOPY flag set. + */ + if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY)) + info->msg->msg_flags &= ~MSG_ZEROCOPY; + + if (info->msg->msg_flags & MSG_ZEROCOPY) + can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len); + + if (can_zcopy) + max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, + (MAX_SKB_FRAGS * PAGE_SIZE)); + } + rest_len = pkt_len; do { struct sk_buff *skb; size_t skb_len; - skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len); + skb_len = min(max_skb_len, rest_len); - skb = virtio_transport_alloc_skb(info, skb_len, + skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy, src_cid, src_port, dst_cid, dst_port); if (!skb) { @@ -270,6 +389,21 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, break; } + /* We process buffer part by part, allocating skb on + * each iteration. If this is last skb for this buffer + * and MSG_ZEROCOPY mode is in use - we must allocate + * completion for the current syscall. + */ + if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY && + skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) { + if (virtio_transport_init_zcopy_skb(vsk, skb, + info->msg, + can_zcopy)) { + ret = -ENOMEM; + break; + } + } + virtio_transport_inc_tx_pkt(vvs, skb); ret = t_ops->send_pkt(skb); @@ -985,7 +1119,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, if (!t) return -ENOTCONN; - reply = virtio_transport_alloc_skb(&info, 0, + reply = virtio_transport_alloc_skb(&info, 0, false, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port), le64_to_cpu(hdr->src_cid),