Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - a smaller one that defines the batch timeout, and the
   overall larger one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well during periods of less intensive workloads

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help retain a high test-to-code
   coverage ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
Linus Torvalds committed Sep 16, 2024
2 parents 69a3a0a + 7cc2a6e commit 3a4d319
Showing 20 changed files with 723 additions and 260 deletions.
3 changes: 3 additions & 0 deletions include/linux/io_uring_types.h
@@ -239,6 +239,9 @@ struct io_ring_ctx {
struct io_rings *rings;
struct percpu_ref refs;

clockid_t clockid;
enum tk_offsets clock_offset;

enum task_work_notify_mode notify_method;
unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
42 changes: 41 additions & 1 deletion include/uapi/linux/io_uring.h
@@ -440,11 +440,21 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
* them from sends.
* IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
* more completions. In other words, the buffer is being
* partially consumed, and will be used by the kernel for
* more completions. This is only set for buffers used via
* the incremental buffer consumption, as provided by
* a ring buffer setup with IOU_PBUF_RING_INC. For any
* other provided buffer type, a buffer passed back in a
* completion is automatically returned to the
* application.
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3)
#define IORING_CQE_F_BUF_MORE (1U << 4)

#define IORING_CQE_BUFFER_SHIFT 16
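
A minimal sketch of how an application might act on the new flag when reaping completions from a ring set up with IOU_PBUF_RING_INC. The struct app_buf bookkeeping, its fields, and consume() are hypothetical application-side helpers, not part of the UAPI:

#include <linux/io_uring.h>
#include <stddef.h>

struct app_buf {
	unsigned char *base;	/* start of the provided buffer */
	size_t offset;		/* how much the kernel has handed back so far */
};

void consume(const unsigned char *data, size_t len);	/* application-defined */

static void handle_buf_cqe(const struct io_uring_cqe *cqe, struct app_buf *bufs)
{
	if (cqe->res <= 0 || !(cqe->flags & IORING_CQE_F_BUFFER))
		return;

	unsigned buf_id = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

	/* data for this completion starts at the application's running offset */
	consume(bufs[buf_id].base + bufs[buf_id].offset, (size_t)cqe->res);

	if (cqe->flags & IORING_CQE_F_BUF_MORE) {
		/* buffer only partially consumed; the kernel keeps using it */
		bufs[buf_id].offset += (size_t)cqe->res;
	} else {
		/* buffer is back with the application and can be re-provided */
		bufs[buf_id].offset = 0;
	}
}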

@@ -507,6 +517,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)

/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)

/*
* io_uring_register(2) opcodes and arguments
@@ -595,6 +607,11 @@ enum io_uring_register_op {
IORING_REGISTER_NAPI = 27,
IORING_UNREGISTER_NAPI = 28,

IORING_REGISTER_CLOCK = 29,

/* copy registered buffers from source ring to current ring */
IORING_REGISTER_COPY_BUFFERS = 30,

/* this goes last */
IORING_REGISTER_LAST,

@@ -675,6 +692,21 @@ struct io_uring_restriction {
__u32 resv2[3];
};

struct io_uring_clock_register {
__u32 clockid;
__u32 __resv[3];
};

enum {
IORING_REGISTER_SRC_REGISTERED = 1,
};

struct io_uring_copy_buffers {
__u32 src_fd;
__u32 flags;
__u32 pad[6];
};
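
For illustration, a hedged sketch of driving the two new registration opcodes from userspace with the raw io_uring_register(2) syscall (there is no libc wrapper). Passing nr_args == 1 is an assumption not confirmed by this excerpt, and which clockids IORING_REGISTER_CLOCK accepts beyond CLOCK_MONOTONIC is not shown here; raw syscall() reports failure as -1 with errno set:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Sketch: set the ring's completion-wait clock, then copy the registered
 * buffers of src_fd into ring_fd using the new opcodes.
 */
static int clone_setup(int ring_fd, int src_fd)
{
	struct io_uring_clock_register clk = { .clockid = CLOCK_MONOTONIC };
	struct io_uring_copy_buffers cb = { .src_fd = src_fd, .flags = 0 };

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_CLOCK, &clk, 1) < 0)
		return -1;

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_COPY_BUFFERS, &cb, 1);
}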

struct io_uring_buf {
__u64 addr;
__u32 len;
@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
* IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
* consumed incrementally. Normally one (or more) buffers
* are fully consumed. With incremental consumption, it's
* feasible to register big ranges of buffers, and each
* use of it will consume only as much as it needs. This
* requires that both the kernel and application keep
* track of where the current read/recv index is at.
*/
enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1,
IOU_PBUF_RING_INC = 2,
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
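
A sketch of registering a provided-buffer ring with the new incremental flag. struct io_uring_buf_reg is not shown in this excerpt, so its use here (and nr_args == 1) follows the pre-existing IORING_REGISTER_PBUF_RING interface and should be treated as an assumption; sizing and populating the ring memory are omitted:

#include <linux/io_uring.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: ring_mem points at a page-aligned mapping holding the
 * struct io_uring_buf_ring for 'entries' buffers.
 */
static int register_inc_buf_ring(int ring_fd, void *ring_mem,
				 unsigned entries, unsigned short bgid)
{
	struct io_uring_buf_reg reg = {
		.ring_addr	= (uint64_t)(uintptr_t)ring_mem,
		.ring_entries	= entries,
		.bgid		= bgid,
		.flags		= IOU_PBUF_RING_INC,
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PBUF_RING, &reg, 1);
}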
@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad;
__u32 min_wait_usec;
__u64 ts;
};
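
Below is a hedged sketch of the min-wait idea from the merge summary combined with this hunk: min_wait_usec bounds the wait for a full batch of completions, while ts remains the overall timeout. The precise wake-up semantics, and checking IORING_FEAT_MIN_TIMEOUT before relying on the field, are assumptions drawn from the summary rather than from this diff:

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: wait up to 100us for a full batch of 'want' completions, and up to
 * 1s overall (semantics paraphrased from the merge summary). If ts held an
 * absolute time on the ring's registered clock, IORING_ENTER_ABS_TIMER would
 * be OR'ed into the enter flags as well.
 */
static int wait_for_batch(int ring_fd, unsigned want)
{
	struct __kernel_timespec ts = { .tv_sec = 1 };
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.min_wait_usec = 100;
	arg.ts = (uint64_t)(uintptr_t)&ts;

	return syscall(__NR_io_uring_enter, ring_fd, 0, want,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}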

13 changes: 13 additions & 0 deletions init/Kconfig
@@ -1687,6 +1687,19 @@ config IO_URING
applications to submit and complete IO through submission and
completion rings that are shared between the kernel and application.

config GCOV_PROFILE_URING
bool "Enable GCOV profiling on the io_uring subsystem"
depends on GCOV_KERNEL
help
Enable GCOV profiling on the io_uring subsystem, to facilitate
code coverage testing.

If unsure, say N.

Note that this will have a negative impact on the performance of
the io_uring subsystem, hence this should only be enabled for
specific test purposes.

config ADVISE_SYSCALLS
bool "Enable madvise/fadvise syscalls" if EXPERT
default y
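
For reference, enabling the new GCOV_PROFILE_URING option in a test kernel configuration would amount to a fragment like the following (CONFIG_GCOV_KERNEL comes from the existing GCOV support this option depends on):

CONFIG_GCOV_KERNEL=y
CONFIG_GCOV_PROFILE_URING=y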
4 changes: 4 additions & 0 deletions io_uring/Makefile
@@ -2,6 +2,10 @@
#
# Makefile for io_uring

ifdef CONFIG_GCOV_PROFILE_URING
GCOV_PROFILE := y
endif

obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \
eventfd.o uring_cmd.o openclose.o \
13 changes: 7 additions & 6 deletions io_uring/eventfd.c
@@ -15,7 +15,7 @@ struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
atomic_t refs;
refcount_t refs;
atomic_t ops;
};

@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)

eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

if (atomic_dec_and_test(&ev_fd->refs))
if (refcount_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu);
}

@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
*/
if (unlikely(!ev_fd))
return;
if (!atomic_inc_not_zero(&ev_fd->refs))
if (!refcount_inc_not_zero(&ev_fd->refs))
return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
}
}
out:
if (atomic_dec_and_test(&ev_fd->refs))
if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
}

@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);

kfree(ev_fd);
return ret;
}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
atomic_set(&ev_fd->refs, 1);
refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (atomic_dec_and_test(&ev_fd->refs))
if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0;
}
14 changes: 13 additions & 1 deletion io_uring/fdinfo.c
@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
cqe->user_data, cqe->res, cqe->flags);

}

spin_unlock(&ctx->completion_lock);

#ifdef CONFIG_NET_RX_BUSY_POLL
if (ctx->napi_enabled) {
seq_puts(m, "NAPI:\tenabled\n");
seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
if (ctx->napi_prefer_busy_poll)
seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
else
seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
} else {
seq_puts(m, "NAPI:\tdisabled\n");
}
#endif
}
#endif
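
Given the seq_puts/seq_printf calls above, the NAPI portion of /proc/<pid>/fdinfo/<ring fd> for a ring with NAPI enabled would look roughly like this (the busy-poll duration shown is illustrative, not taken from the diff):

NAPI:	enabled
napi_busy_poll_dt:	50000
napi_prefer_busy_poll:	true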
25 changes: 19 additions & 6 deletions io_uring/io-wq.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/task_work.h>
#include <linux/audit.h>
#include <linux/mmu_context.h>
@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
goto err;
cpumask_copy(wq->cpu_mask, cpu_possible_mask);
cpuset_cpus_allowed(data->task, wq->cpu_mask);
wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)

int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{
cpumask_var_t allowed_mask;
int ret = 0;

if (!tctx || !tctx->io_wq)
return -EINVAL;

if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
return -ENOMEM;

rcu_read_lock();
if (mask)
cpumask_copy(tctx->io_wq->cpu_mask, mask);
else
cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
if (mask) {
if (cpumask_subset(mask, allowed_mask))
cpumask_copy(tctx->io_wq->cpu_mask, mask);
else
ret = -EINVAL;
} else {
cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
}
rcu_read_unlock();

return 0;
free_cpumask_var(allowed_mask);
return ret;
}

/*
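With the cpuset checks above, registering an io-wq affinity mask that falls outside the task's cpuset is now rejected with -EINVAL. A sketch of the userspace side using the pre-existing IORING_REGISTER_IOWQ_AFF opcode; passing the mask size in bytes as nr_args follows liburing's wrapper and is an assumption here:

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: pin io-wq workers to a single CPU. After this merge the call is
 * expected to fail if 'cpu' lies outside the calling task's cpuset.
 */
static int pin_iowq_to_cpu(int ring_fd, int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}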
