Merge tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Final separate updates for io_uring.

  This started out as a series of cleanups and improvements for registered
  buffers, but as the last series of the io_uring changes for 6.15, it also
  collected a few fixes for the other branches on top:

   - Add support for vectored fixed/registered buffers.

     Previously, only single segments were supported for commands; now
     vectored variants are supported as well. This series includes
     networking and file read/write support.

   - Small series unifying return codes across multishot and single shot.

   - Small series cleaning up registered buffer importing.

   - Adding support for vectored registered buffers for uring_cmd.

   - Fix for io-wq handling of command reissue.

   - Various little fixes and tweaks"
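A minimal userspace sketch of the vectored fixed/registered buffer support from the first item above, queuing a read through the new IORING_OP_READV_FIXED opcode that this merge adds. The SQE field usage here (addr = iovec array, len = segment count, buf_index = registered buffer slot) is inferred by analogy with IORING_OP_READ_FIXED and should be treated as an assumption rather than ABI documentation; 6.15 uapi headers are required, and liburing is used only for ring/SQE plumbing.

/* Hedged sketch: vectored read into a registered buffer (assumed SQE layout). */
#include <liburing.h>
#include <string.h>
#include <sys/uio.h>

static int queue_readv_fixed(struct io_uring *ring, int fd,
			     const struct iovec *vecs, unsigned nr_vecs,
			     unsigned buf_index, __u64 offset)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;

	/* Every iovec must point into the buffer registered at 'buf_index'. */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV_FIXED;	/* new opcode in this pull */
	sqe->fd = fd;
	sqe->addr = (unsigned long) vecs;
	sqe->len = nr_vecs;
	sqe->off = offset;
	sqe->buf_index = buf_index;
	return 0;
}

After io_uring_submit(), the result arrives as a normal CQE carrying the byte count, the same as for the single-segment fixed-buffer opcodes.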

* tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux: (25 commits)
  io_uring/net: fix io_req_post_cqe abuse by send bundle
  io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc
  io_uring: move min_events sanitisation
  io_uring: rename "min" arg in io_iopoll_check()
  io_uring: open code __io_post_aux_cqe()
  io_uring: defer iowq cqe overflow via task_work
  io_uring: fix retry handling off iowq
  io_uring/net: only import send_zc buffer once
  io_uring/cmd: introduce io_uring_cmd_import_fixed_vec
  io_uring/cmd: add iovec cache for commands
  io_uring/cmd: don't expose entire cmd async data
  io_uring: rename the data cmd cache
  io_uring: rely on io_prep_reg_vec for iovec placement
  io_uring: introduce io_prep_reg_iovec()
  io_uring: unify STOP_MULTISHOT with IOU_OK
  io_uring: return -EAGAIN to continue multishot
  io_uring: cap cached iovec/bvec size
  io_uring/net: implement vectored reg bufs for zctx
  io_uring/net: convert to struct iou_vec
  io_uring/net: pull vec alloc out of msghdr import
  ...
Linus Torvalds committed Mar 28, 2025
2 parents 6df9d08 + 6889ae1 commit eff5f16
Showing 16 changed files with 567 additions and 211 deletions.
13 changes: 13 additions & 0 deletions include/linux/io_uring/cmd.h
@@ -43,6 +43,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter,
struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
+ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
+ const struct iovec __user *uvec,
+ size_t uvec_segs,
+ int ddir, struct iov_iter *iter,
+ unsigned issue_flags);

/*
* Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
@@ -76,6 +81,14 @@ io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
return -EOPNOTSUPP;
}
+ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
+ const struct iovec __user *uvec,
+ size_t uvec_segs,
+ int ddir, struct iov_iter *iter,
+ unsigned issue_flags)
+ {
+ return -EOPNOTSUPP;
+ }
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
u64 ret2, unsigned issue_flags)
{
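For orientation, a condensed and hypothetical driver-side sketch of how a ->uring_cmd() handler might use the io_uring_cmd_import_fixed_vec() helper declared above. Only the helper's signature is taken from the header; the choice of SQE fields carrying the iovec pointer and segment count, and the my_drv_* naming, are illustrative assumptions, not part of this patchset.

#include <linux/io_uring/cmd.h>
#include <linux/uio.h>

static int my_drv_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	/* Assumed command layout: sqe->addr/len carry the user iovec array. */
	const struct iovec __user *uvec =
		u64_to_user_ptr(READ_ONCE(ioucmd->sqe->addr));
	size_t nr_segs = READ_ONCE(ioucmd->sqe->len);
	struct iov_iter iter;
	int ret;

	/* Resolve the user iovec against the command's registered buffer;
	 * ITER_DEST because this (hypothetical) command reads into it. */
	ret = io_uring_cmd_import_fixed_vec(ioucmd, uvec, nr_segs,
					    ITER_DEST, &iter, issue_flags);
	if (ret)
		return ret;

	/*
	 * 'iter' now describes registered-buffer memory; a real driver would
	 * map it to its transfer (e.g. a bio) and later post the CQE with
	 * io_uring_cmd_done().
	 */
	return -EIOCBQUEUED;
}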
19 changes: 18 additions & 1 deletion include/linux/io_uring_types.h
@@ -110,6 +110,14 @@ struct io_uring_task {
} ____cacheline_aligned_in_smp;
};

+ struct iou_vec {
+ union {
+ struct iovec *iovec;
+ struct bio_vec *bvec;
+ };
+ unsigned nr; /* number of struct iovec it can hold */
+ };
+
struct io_uring {
u32 head;
u32 tail;
@@ -310,7 +318,7 @@ struct io_ring_ctx {
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
struct io_alloc_cache rw_cache;
- struct io_alloc_cache uring_cache;
+ struct io_alloc_cache cmd_cache;

/*
* Any cancelable uring_cmd is added to this list in
@@ -482,6 +490,7 @@ enum {
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
+ REQ_F_MULTISHOT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
/* keep async read/write and isreg together and in order */
@@ -494,6 +503,7 @@ enum {
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
+ REQ_F_IMPORT_BUFFER_BIT,

/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -558,6 +568,8 @@ enum {
REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
/* double poll may active */
REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
+ /* request posts multiple completions, should be set at prep time */
+ REQ_F_MULTISHOT = IO_REQ_FLAG(REQ_F_MULTISHOT_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
/* recvmsg special flag, clear EPOLLIN */
@@ -576,6 +588,11 @@ enum {
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
/* request has read/write metadata assigned */
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
+ /*
+ * For vectored fixed buffers, resolve iovec to registered buffers.
+ * For SEND_ZC, whether to import buffers (i.e. the first issue).
+ */
+ REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
2 changes: 2 additions & 0 deletions include/uapi/linux/io_uring.h
@@ -281,6 +281,8 @@ enum io_uring_op {
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
IORING_OP_EPOLL_WAIT,
+ IORING_OP_READV_FIXED,
+ IORING_OP_WRITEV_FIXED,

/* this goes last, obviously */
IORING_OP_LAST,
9 changes: 0 additions & 9 deletions io_uring/alloc_cache.h
@@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,

void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);

- static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
- {
- if (IS_ENABLED(CONFIG_KASAN)) {
- kfree(*iov);
- *iov = NULL;
- *nr = 0;
- }
- }
-
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
void *entry)
{
65 changes: 27 additions & 38 deletions io_uring/io_uring.c
@@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->apoll_cache, kfree);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
- io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
@@ -334,8 +334,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_rw),
offsetof(struct io_async_rw, clear));
- ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_uring_cmd_data), 0);
+ ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_async_cmd),
+ sizeof(struct io_async_cmd));
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_kiocb), 0);
@@ -833,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}

- static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
- u32 cflags)
+ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;

+ io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

- return filled;
- }
-
- bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
- {
- bool filled;
-
- io_cq_lock(ctx);
- filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
io_cq_unlock_post(ctx);
return filled;
}
@@ -891,6 +882,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool completed = true;

/*
* All execution paths but io-wq use the deferred completions by
@@ -903,19 +895,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->lockless_cq) {
+ if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
+ defer_complete:
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
}

io_cq_lock(ctx);
- if (!(req->flags & REQ_F_CQE_SKIP)) {
- if (!io_fill_cqe_req(ctx, req))
- io_req_cqe_overflow(req);
- }
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ completed = io_fill_cqe_req(ctx, req);
io_cq_unlock_post(ctx);

+ if (!completed)
+ goto defer_complete;

/*
* We don't free the request here because we know it's called from
* io-wq only, which holds a reference, so it cannot be the last put.
@@ -1511,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}

- static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
+ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
unsigned int nr_events = 0;
unsigned long check_cq;

+ min_events = min(min_events, ctx->cq_entries);

lockdep_assert_held(&ctx->uring_lock);

if (!io_allowed_run_tw(ctx))
@@ -1557,7 +1553,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;

- (void) io_run_local_work_locked(ctx, min);
+ (void) io_run_local_work_locked(ctx, min_events);

if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -1570,7 +1566,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, !min);
+ ret = io_do_iopoll(ctx, !min_events);
if (unlikely(ret < 0))
return ret;

@@ -1580,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
break;

nr_events += ret;
- } while (nr_events < min);
+ } while (nr_events < min_events);

return 0;
}
@@ -1791,10 +1787,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)

ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);

- WARN_ON_ONCE(ret == IOU_OK);
-
- if (ret == IOU_ISSUE_SKIP_COMPLETE)
- ret = 0;
+ WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE);
return ret;
}

@@ -1847,7 +1840,7 @@ void io_wq_submit_work(struct io_wq_work *work)
* Don't allow any multishot execution from io-wq. It's more restrictive
* than necessary and also cleaner.
*/
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) {
err = -EBADFD;
if (!io_file_can_poll(req))
goto fail;
@@ -1858,7 +1851,7 @@ void io_wq_submit_work(struct io_wq_work *work)
goto fail;
return;
} else {
- req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT);
}
}

@@ -2549,6 +2542,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
ktime_t start_time;
int ret;

+ min_events = min_t(int, min_events, ctx->cq_entries);

if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (io_local_work_pending(ctx))
@@ -3435,22 +3430,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
mutex_lock(&ctx->uring_lock);
iopoll_locked:
ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
+ if (likely(!ret2))
ret2 = io_iopoll_check(ctx, min_complete);
- }
mutex_unlock(&ctx->uring_lock);
} else {
struct ext_arg ext_arg = { .argsz = argsz };

ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
+ if (likely(!ret2))
ret2 = io_cqring_wait(ctx, min_complete, flags,
&ext_arg);
- }
}

if (!ret) {
19 changes: 11 additions & 8 deletions io_uring/io_uring.h
@@ -19,22 +19,25 @@
#endif

enum {
- IOU_OK = 0,
+ IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
+ IOU_COMPLETE = 0,
+
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,

+ /*
+ * The request has more work to do and should be retried. io_uring will
+ * attempt to wait on the file for eligible opcodes, but otherwise
+ * it'll be handed to iowq for blocking execution. It works for normal
+ * requests as well as for the multi shot mode.
+ */
+ IOU_RETRY = -EAGAIN,
+
/*
* Requeue the task_work to restart operations on this request. The
* actual value isn't important, should just be not an otherwise
* valid error code, yet less than -MAX_ERRNO and valid internally.
*/
IOU_REQUEUE = -3072,

- /*
- * Intended only when both IO_URING_F_MULTISHOT is passed
- * to indicate to the poll runner that multishot should be
- * removed and the result is set on req->cqe.res.
- */
- IOU_STOP_MULTISHOT = -ECANCELED,
};

struct io_wait_queue {
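To show how the reworked return codes above are meant to be used by opcode handlers, here is a condensed, hypothetical example; everything named io_demo_* is illustrative and not part of this merge. Handlers return IOU_RETRY when the request should be re-driven (poll wait or io-wq punt, for single-shot and multishot alike) and IOU_COMPLETE once the result has been recorded.

static int io_demo_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_demo_do_recv(req, issue_flags);	/* hypothetical helper */

	/* Would block or more data expected: let io_uring drive a retry. */
	if (ret == -EAGAIN)
		return IOU_RETRY;

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}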