Skip to content

Commit

Permalink
io_uring: add support for kernel registered bvecs
Browse files Browse the repository at this point in the history
Provide an interface for the kernel to leverage the existing
pre-registered buffers that io_uring provides. User space can reference
these later to achieve zero-copy IO.

User space must register an empty fixed buffer table with io_uring in
order for the kernel to make use of it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
  • Loading branch information
Keith Busch authored and Jens Axboe committed Feb 28, 2025
1 parent 99fde89 commit 27cb27b
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 7 deletions.
7 changes: 7 additions & 0 deletions include/linux/io_uring/cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <linux/blk-mq.h>

/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
#define IORING_URING_CMD_CANCELABLE (1U << 30)
Expand Down Expand Up @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
return cmd_to_io_kiocb(cmd)->async_data;
}

/*
 * io_buffer_register_bvec - publish a kernel-owned buffer in the fixed table
 *
 * Installs the bio_vecs of @rq into slot @index of the ring's pre-registered
 * buffer table so user space can reference them for zero-copy I/O.  @release
 * is invoked with the registering driver's private data once the last
 * reference to the buffer is dropped.  @issue_flags must be the issue_flags
 * the ->uring_cmd() handler was invoked with.  Returns 0 on success or a
 * negative errno (-EINVAL bad index, -EBUSY slot occupied, -ENOMEM).
 *
 * io_buffer_unregister_bvec - remove a buffer installed by the above; only
 * kernel-registered (is_kbuf) entries are affected.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags);
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			       unsigned int issue_flags);

#endif /* _LINUX_IO_URING_CMD_H */
3 changes: 3 additions & 0 deletions io_uring/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -3940,6 +3940,9 @@ static int __init io_uring_init(void)

io_uring_optable_init();

/* imu->dir is u8 */
BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);

/*
* Allow user copy in the per-command field, which starts after the
* file in io_kiocb and until the opcode field. The openat2 handling
Expand Down
123 changes: 116 additions & 7 deletions io_uring/rsrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

Expand Down Expand Up @@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov)
return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
static void io_release_ubuf(void *priv)
{
struct io_mapped_ubuf *imu = node->buf;
struct io_mapped_ubuf *imu = priv;
unsigned int i;

if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
}

/*
 * Drop one reference to a mapped buffer.  On the final put, unaccount any
 * pages that were charged at registration time, invoke the owner's release
 * callback (io_release_ubuf for user memory, the driver's callback for
 * kernel-registered bvecs) and free the imu itself.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	kvfree(imu);
}

Expand Down Expand Up @@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
break;
case IORING_RSRC_BUFFER:
if (node->buf)
io_buffer_unmap(ctx, node);
io_buffer_unmap(ctx, node->buf);
break;
default:
WARN_ON_ONCE(1);
Expand Down Expand Up @@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->len = iov->iov_len;
imu->nr_bvecs = nr_pages;
imu->folio_shift = PAGE_SHIFT;
imu->release = io_release_ubuf;
imu->priv = imu;
imu->is_kbuf = false;
imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
Expand Down Expand Up @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}

/*
 * io_buffer_register_bvec - expose a request's pages as a fixed buffer
 * @cmd:	the uring_cmd this registration is performed on behalf of
 * @rq:		block request whose bio_vec segments back the buffer
 * @release:	called with @rq when the buffer's last reference is dropped
 * @index:	slot in the ring's pre-registered buffer table to fill
 * @issue_flags: issue_flags the ->uring_cmd() handler was invoked with
 *
 * The slot must exist (user space registered the table) and be empty.
 * Returns 0 on success, -EINVAL/-EBUSY/-ENOMEM on failure.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	/* uring_lock protects the buffer table against concurrent updates */
	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	/* clamp the index under speculation after the bounds check */
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/* one bvec per physical segment of the request */
	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;			/* kernel buffers have no user address */
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;		/* not charged against user rlimits */
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;

	/*
	 * A write request sources data from this buffer; a read request
	 * fills it.  Import is restricted to the matching direction.
	 */
	if (op_is_write(req_op(rq)))
		imu->dir = IO_IMU_SOURCE;
	else
		imu->dir = IO_IMU_DEST;

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

/*
 * Remove a buffer previously installed with io_buffer_register_bvec().
 * Only kernel-registered (is_kbuf) entries are torn down; out-of-range
 * indices, empty slots and user-registered buffers are silently ignored.
 */
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			       unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *tbl = &ctx->buf_table;
	struct io_rsrc_node *node;

	io_ring_submit_lock(ctx, issue_flags);
	if (index < tbl->nr) {
		/* clamp the index under speculation after the bounds check */
		index = array_index_nospec(index, tbl->nr);
		node = tbl->nodes[index];
		if (node && node->buf->is_kbuf) {
			io_put_rsrc_node(ctx, node);
			tbl->nodes[index] = NULL;
		}
	}
	io_ring_submit_unlock(ctx, issue_flags);
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len)
Expand All @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
/* not inside the mapped region */
if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
return -EFAULT;
if (!(imu->dir & (1 << ddir)))
return -EFAULT;

/*
* Might not be a start of buffer, set size appropriately
Expand All @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
/*
* Don't use iov_iter_advance() here, as it's really slow for
* using the latter parts of a big fixed buffer - it iterates
* over each segment manually. We can cheat a bit here, because
* we know that:
* over each segment manually. We can cheat a bit here for user
* registered nodes, because we know that:
*
* 1) it's a BVEC iter, we set it up
* 2) all bvecs are the same in size, except potentially the
Expand All @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
*/
const struct bio_vec *bvec = imu->bvec;

/*
* Kernel buffer bvecs, on the other hand, don't necessarily
* have the size property of user registered ones, so we have
* to use the slow iter advance.
*/
if (offset < bvec->bv_len) {
iter->iov_offset = offset;
} else if (imu->is_kbuf) {
iov_iter_advance(iter, offset);
} else {
unsigned long seg_skip;

Expand Down
9 changes: 9 additions & 0 deletions io_uring/rsrc.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,22 @@ struct io_rsrc_node {
};
};

/*
 * Permitted transfer directions for a registered buffer, stored as a bit
 * mask in io_mapped_ubuf.dir and checked against the iter direction at
 * import time.  Bit positions mirror ITER_DEST/ITER_SOURCE.
 */
enum {
	IO_IMU_DEST	= 1 << ITER_DEST,
	IO_IMU_SOURCE	= 1 << ITER_SOURCE,
};

struct io_mapped_ubuf {
	u64		ubuf;		/* user virtual start address; 0 for kernel bvecs */
	unsigned int	len;		/* total length of the buffer in bytes */
	unsigned int	nr_bvecs;	/* number of entries in bvec[] */
	unsigned int	folio_shift;	/* log2 segment size used for fast offset math */
	refcount_t	refs;		/* outstanding users of this mapping */
	unsigned long	acct_pages;	/* pages charged via io_unaccount_mem on teardown */
	void		(*release)(void *);	/* invoked with @priv on final put */
	void		*priv;		/* the imu itself (user) or the request (kbuf) */
	bool		is_kbuf;	/* true for kernel-registered bvec buffers */
	u8		dir;		/* IO_IMU_* direction mask */
	struct bio_vec	bvec[] __counted_by(nr_bvecs);
};

Expand Down
3 changes: 3 additions & 0 deletions io_uring/rw.c
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
*/
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
struct io_kiocb *req = cmd_to_io_kiocb(rw);
struct kiocb *kiocb = &rw->kiocb;
struct file *file = kiocb->ki_filp;
ssize_t ret = 0;
Expand All @@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
if ((kiocb->ki_flags & IOCB_NOWAIT) &&
!(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
return -EFAULT;

ppos = io_kiocb_ppos(kiocb);

Expand Down

0 comments on commit 27cb27b

Please sign in to comment.