Merge tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
 "Fairly quiet round in terms of features, mostly just improvements all
  over the map for existing code. In detail:

   - Initial support for socket operations through io_uring. Latter half
     of this will likely land with the 6.7 kernel, then allowing things
     like get/setsockopt (Breno; a usage sketch follows the uapi diff
     below)

   - Cleanup of the cancel code, and then adding support for canceling
     requests with the opcode as the key (me; see the sketch right
     after this summary)

   - Improvements for the io-wq locking (me)

   - Fix affinity setting for SQPOLL based io-wq (me)

   - Remove the io_uring userspace code. These were added initially as
     copies from liburing, but all of them have since bitrotted and are
     way out of date at this point. Rather than attempt to keep them in
     sync, just get rid of them. People will have liburing available
     anyway for these examples. (Pavel)

   - Series improving the CQ/SQ ring caching (Pavel)

   - Misc fixes and cleanups (Pavel, Yue, me)"
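
The opcode-keyed cancelation lands in the uapi as a new IORING_ASYNC_CANCEL_OP flag plus an opcode field in struct io_uring_sync_cancel_reg (see the include/uapi/linux/io_uring.h diff below). The following is a minimal sketch of driving it from userspace, assuming liburing 2.5+ headers that carry the 6.6 uapi and liburing's existing io_uring_register_sync_cancel() helper; cancel_all_of_opcode() itself is a hypothetical name:

/*
 * Sketch: synchronously cancel every pending request of one opcode.
 * Assumes liburing >= 2.5 (for the ->opcode field in struct
 * io_uring_sync_cancel_reg) and a 6.6+ kernel; older kernels fail
 * the registration with -EINVAL.
 */
#include <liburing.h>
#include <string.h>

static int cancel_all_of_opcode(struct io_uring *ring, __u8 op)
{
        struct io_uring_sync_cancel_reg reg;

        memset(&reg, 0, sizeof(reg));
        /* key the match on the opcode, and cancel all matches */
        reg.flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
        reg.opcode = op;                /* e.g. IORING_OP_TIMEOUT */
        reg.timeout.tv_sec = -1;        /* -1/-1 means no timeout */
        reg.timeout.tv_nsec = -1;

        /* returns 0 on success, -errno on failure */
        return io_uring_register_sync_cancel(ring, &reg);
}

Without IORING_ASYNC_CANCEL_ALL only the first matching request is canceled; with it, the call does not return until every request with that opcode has been dealt with.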

* tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux: (47 commits)
  io_uring: move iopoll ctx fields around
  io_uring: move multishot cqe cache in ctx
  io_uring: separate task_work/waiting cache line
  io_uring: banish non-hot data to end of io_ring_ctx
  io_uring: move non aligned field to the end
  io_uring: add option to remove SQ indirection
  io_uring: compact SQ/CQ heads/tails
  io_uring: force inline io_fill_cqe_req
  io_uring: merge iopoll and normal completion paths
  io_uring: reorder cqring_flush and wakeups
  io_uring: optimise extra io_get_cqe null check
  io_uring: refactor __io_get_cqe()
  io_uring: simplify big_cqe handling
  io_uring: cqe init hardening
  io_uring: improve cqe !tracing hot path
  io_uring/rsrc: Annotate struct io_mapped_ubuf with __counted_by
  io_uring/sqpoll: fix io-wq affinity when IORING_SETUP_SQPOLL is used
  io_uring: simplify io_run_task_work_sig return
  io_uring/rsrc: keep one global dummy_ubuf
  io_uring: never overflow io_aux_cqe
  ...
Linus Torvalds committed Aug 30, 2023
2 parents adfd671 + 644c4a7 commit c1b7fcf
Showing 31 changed files with 432 additions and 1,767 deletions.
MAINTAINERS (1 change: 0 additions & 1 deletion)

@@ -10966,7 +10966,6 @@ F: include/linux/io_uring_types.h
 F: include/trace/events/io_uring.h
 F: include/uapi/linux/io_uring.h
 F: io_uring/
-F: tools/io_uring/
 
 IPMI SUBSYSTEM
 M: Corey Minyard <minyard@acm.org>

include/linux/io_uring.h (6 changes: 6 additions & 0 deletions)

@@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk)
         if (tsk->io_uring)
                 __io_uring_free(tsk);
 }
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
                               struct iov_iter *iter, void *ioucmd)
@@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode)
 {
         return "";
 }
+static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
+                                    unsigned int issue_flags)
+{
+        return -EOPNOTSUPP;
+}
 #endif
 
 #endif

include/linux/io_uring_types.h (129 changes: 65 additions & 64 deletions)

@@ -69,8 +69,8 @@ struct io_uring_task {
 };
 
 struct io_uring {
-        u32 head ____cacheline_aligned_in_smp;
-        u32 tail ____cacheline_aligned_in_smp;
+        u32 head;
+        u32 tail;
 };
 
 /*
@@ -176,7 +176,6 @@ struct io_submit_state {
         unsigned short submit_nr;
         unsigned int cqes_count;
         struct blk_plug plug;
-        struct io_uring_cqe cqes[16];
 };
 
 struct io_ev_fd {
@@ -205,25 +204,17 @@ struct io_ring_ctx {
                 unsigned int has_evfd: 1;
                 /* all CQEs should be posted only by the submitter task */
                 unsigned int task_complete: 1;
+                unsigned int lockless_cq: 1;
                 unsigned int syscall_iopoll: 1;
                 unsigned int poll_activated: 1;
                 unsigned int drain_disabled: 1;
                 unsigned int compat: 1;
 
-                enum task_work_notify_mode notify_method;
+                struct task_struct *submitter_task;
+                struct io_rings *rings;
+                struct percpu_ref refs;
 
-                /*
-                 * If IORING_SETUP_NO_MMAP is used, then the below holds
-                 * the gup'ed pages for the two rings, and the sqes.
-                 */
-                unsigned short n_ring_pages;
-                unsigned short n_sqe_pages;
-                struct page **ring_pages;
-                struct page **sqe_pages;
-
-                struct io_rings *rings;
-                struct task_struct *submitter_task;
-                struct percpu_ref refs;
+                enum task_work_notify_mode notify_method;
         } ____cacheline_aligned_in_smp;
 
         /* submission data */
@@ -261,31 +252,20 @@ struct io_ring_ctx {
 
                 struct io_buffer_list *io_bl;
                 struct xarray io_bl_xa;
-                struct list_head io_buffers_cache;
 
                 struct io_hash_table cancel_table_locked;
-                struct list_head cq_overflow_list;
                 struct io_alloc_cache apoll_cache;
                 struct io_alloc_cache netmsg_cache;
-        } ____cacheline_aligned_in_smp;
 
-        /* IRQ completion list, under ->completion_lock */
-        struct io_wq_work_list locked_free_list;
-        unsigned int locked_free_nr;
-
-        const struct cred *sq_creds;    /* cred used for __io_sq_thread() */
-        struct io_sq_data *sq_data;     /* if using sq thread polling */
-
-        struct wait_queue_head sqo_sq_wait;
-        struct list_head sqd_list;
-
-        unsigned long check_cq;
-
-        unsigned int file_alloc_start;
-        unsigned int file_alloc_end;
-
-        struct xarray personalities;
-        u32 pers_next;
+                /*
+                 * ->iopoll_list is protected by the ctx->uring_lock for
+                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+                 * For SQPOLL, only the single threaded io_sq_thread() will
+                 * manipulate the list, hence no extra locking is needed there.
+                 */
+                struct io_wq_work_list iopoll_list;
+                bool poll_multi_queue;
+        } ____cacheline_aligned_in_smp;
 
         struct {
                 /*
@@ -298,39 +278,55 @@ struct io_ring_ctx {
                 unsigned cached_cq_tail;
                 unsigned cq_entries;
                 struct io_ev_fd __rcu *io_ev_fd;
-                struct wait_queue_head cq_wait;
                 unsigned cq_extra;
         } ____cacheline_aligned_in_smp;
 
+        /*
+         * task_work and async notification delivery cacheline. Expected to
+         * regularly bounce b/w CPUs.
+         */
         struct {
-                spinlock_t completion_lock;
-
-                bool poll_multi_queue;
-                atomic_t cq_wait_nr;
-
-                /*
-                 * ->iopoll_list is protected by the ctx->uring_lock for
-                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
-                 * For SQPOLL, only the single threaded io_sq_thread() will
-                 * manipulate the list, hence no extra locking is needed there.
-                 */
-                struct io_wq_work_list iopoll_list;
-                struct io_hash_table cancel_table;
-
                 struct llist_head work_llist;
-
-                struct list_head io_buffers_comp;
+                unsigned long check_cq;
+                atomic_t cq_wait_nr;
+                atomic_t cq_timeouts;
+                struct wait_queue_head cq_wait;
         } ____cacheline_aligned_in_smp;
 
         /* timeouts */
         struct {
                 spinlock_t timeout_lock;
-                atomic_t cq_timeouts;
                 struct list_head timeout_list;
                 struct list_head ltimeout_list;
                 unsigned cq_last_tm_flush;
         } ____cacheline_aligned_in_smp;
 
+        struct io_uring_cqe completion_cqes[16];
+
+        spinlock_t completion_lock;
+
+        /* IRQ completion list, under ->completion_lock */
+        struct io_wq_work_list locked_free_list;
+        unsigned int locked_free_nr;
+
+        struct list_head io_buffers_comp;
+        struct list_head cq_overflow_list;
+        struct io_hash_table cancel_table;
+
+        const struct cred *sq_creds;    /* cred used for __io_sq_thread() */
+        struct io_sq_data *sq_data;     /* if using sq thread polling */
+
+        struct wait_queue_head sqo_sq_wait;
+        struct list_head sqd_list;
+
+        unsigned int file_alloc_start;
+        unsigned int file_alloc_end;
+
+        struct xarray personalities;
+        u32 pers_next;
+
+        struct list_head io_buffers_cache;
+
         /* Keep this last, we don't need it for the fast path */
         struct wait_queue_head poll_wq;
         struct io_restriction restrictions;
@@ -374,6 +370,15 @@ struct io_ring_ctx {
         unsigned sq_thread_idle;
         /* protected by ->completion_lock */
         unsigned evfd_last_cq_tail;
+
+        /*
+         * If IORING_SETUP_NO_MMAP is used, then the below holds
+         * the gup'ed pages for the two rings, and the sqes.
+         */
+        unsigned short n_ring_pages;
+        unsigned short n_sqe_pages;
+        struct page **ring_pages;
+        struct page **sqe_pages;
 };
 
 struct io_tw_state {
@@ -409,7 +414,6 @@ enum {
         REQ_F_SINGLE_POLL_BIT,
         REQ_F_DOUBLE_POLL_BIT,
         REQ_F_PARTIAL_IO_BIT,
-        REQ_F_CQE32_INIT_BIT,
         REQ_F_APOLL_MULTISHOT_BIT,
         REQ_F_CLEAR_POLLIN_BIT,
         REQ_F_HASH_LOCKED_BIT,
@@ -479,8 +483,6 @@ enum {
         REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
         /* fast poll multishot mode */
         REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
-        /* ->extra1 and ->extra2 are initialised */
-        REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
         /* recvmsg special flag, clear EPOLLIN */
         REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
         /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
@@ -579,13 +581,7 @@ struct io_kiocb {
         struct io_task_work io_task_work;
         unsigned nr_tw;
         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-        union {
-                struct hlist_node hash_node;
-                struct {
-                        u64 extra1;
-                        u64 extra2;
-                };
-        };
+        struct hlist_node hash_node;
         /* internal polling, see IORING_FEAT_FAST_POLL */
         struct async_poll *apoll;
         /* opcode allocated if it needs to store data for async defer */
@@ -595,6 +591,11 @@ struct io_kiocb {
         /* custom credentials, valid IFF REQ_F_CREDS is set */
         const struct cred *creds;
         struct io_wq_work work;
+
+        struct {
+                u64 extra1;
+                u64 extra2;
+        } big_cqe;
 };
 
 struct io_overflow_cqe {
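
The big_cqe member added above is where the kernel now keeps the two extra payload words of 32-byte CQEs; the old layout overlaid them on hash_node through a union. On rings created with IORING_SETUP_CQE32, userspace sees the same two words as the trailing big_cqe[] array of each CQE. A hedged sketch using liburing's public definitions (the handler name is illustrative):

/*
 * Sketch: consume the extra payload of a 32-byte CQE. Assumes the
 * ring was set up with IORING_SETUP_CQE32, in which case liburing's
 * struct io_uring_cqe exposes the extra words as big_cqe[].
 */
#include <liburing.h>

static void handle_big_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
{
        if (ring->flags & IORING_SETUP_CQE32) {
                __u64 extra1 = cqe->big_cqe[0]; /* kernel: req->big_cqe.extra1 */
                __u64 extra2 = cqe->big_cqe[1]; /* kernel: req->big_cqe.extra2 */

                /* ... interpret extra1/extra2, e.g. for a uring_cmd ... */
                (void)extra1;
                (void)extra2;
        }
}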

include/uapi/linux/io_uring.h (21 changes: 20 additions & 1 deletion)

@@ -185,6 +185,11 @@ enum {
  */
 #define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
 
+/*
+ * Removes indirection through the SQ index array.
+ */
+#define IORING_SETUP_NO_SQARRAY (1U << 16)
+
 enum io_uring_op {
         IORING_OP_NOP,
         IORING_OP_READV,
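
IORING_SETUP_NO_SQARRAY makes the kernel consume SQEs directly in ring order rather than dereferencing the SQ index array first. A sketch of opting in at setup time, with a fallback for kernels that predate the flag; it assumes a liburing release that already understands the resulting ring layout:

/*
 * Sketch: create a ring without the SQ index array. The flag needs a
 * 6.6+ kernel, and liburing itself must know the layout (assumed
 * liburing >= 2.5 here).
 */
#include <liburing.h>
#include <string.h>
#include <errno.h>

static int ring_init_no_sqarray(struct io_uring *ring, unsigned entries)
{
        struct io_uring_params p;
        int ret;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_NO_SQARRAY;

        ret = io_uring_queue_init_params(entries, ring, &p);
        if (ret == -EINVAL) {
                /* pre-6.6 kernel rejects unknown setup flags */
                memset(&p, 0, sizeof(p));
                ret = io_uring_queue_init_params(entries, ring, &p);
        }
        return ret;     /* 0 on success, -errno on failure */
}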
@@ -299,11 +304,15 @@ enum io_uring_op {
  *                              request 'user_data'
  * IORING_ASYNC_CANCEL_ANY      Match any request
  * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
+ * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key
+ * IORING_ASYNC_CANCEL_OP       Match request based on opcode
  */
 #define IORING_ASYNC_CANCEL_ALL (1U << 0)
 #define IORING_ASYNC_CANCEL_FD  (1U << 1)
 #define IORING_ASYNC_CANCEL_ANY (1U << 2)
 #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
+#define IORING_ASYNC_CANCEL_USERDATA (1U << 4)
+#define IORING_ASYNC_CANCEL_OP (1U << 5)
 
 /*
  * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@@ -697,7 +706,9 @@ struct io_uring_sync_cancel_reg {
         __s32 fd;
         __u32 flags;
         struct __kernel_timespec timeout;
-        __u64 pad[4];
+        __u8 opcode;
+        __u8 pad[7];
+        __u64 pad2[3];
 };
 
 /*
Expand All @@ -717,6 +728,14 @@ struct io_uring_recvmsg_out {
__u32 flags;
};

/*
* Argument for IORING_OP_URING_CMD when file is a socket
*/
enum {
SOCKET_URING_OP_SIOCINQ = 0,
SOCKET_URING_OP_SIOCOUTQ,
};

#ifdef __cplusplus
}
#endif
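
SOCKET_URING_OP_SIOCINQ and SOCKET_URING_OP_SIOCOUTQ mirror the classic SIOCINQ/SIOCOUTQ socket ioctls as IORING_OP_URING_CMD commands. A sketch of the SIOCINQ side, assuming a liburing new enough to ship the io_uring_prep_cmd_sock() helper (older releases would have to fill in the SQE fields by hand); socket_bytes_readable() is a hypothetical wrapper:

/*
 * Sketch: ask how many unread bytes sit in a socket's receive queue
 * via io_uring. Needs a 6.6+ kernel; the level/optname/optval/optlen
 * arguments of io_uring_prep_cmd_sock() are unused by SIOCINQ.
 */
#include <liburing.h>
#include <errno.h>
#include <stddef.h>

static int socket_bytes_readable(struct io_uring *ring, int sockfd)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        sqe = io_uring_get_sqe(ring);
        if (!sqe)
                return -EAGAIN;
        io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd,
                               0, 0, NULL, 0);

        ret = io_uring_submit(ring);
        if (ret < 0)
                return ret;
        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
                return ret;
        ret = cqe->res;         /* byte count, or -errno */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}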