Skip to content

Commit

Permalink
Merge tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux
Browse files Browse the repository at this point in the history
Pull io_uring updates from Jens Axboe:

 - Cleanup of the io-wq per-node mapping, notably getting rid of it so
   we just have a single io_wq entry per ring (Breno)

 - Followup to the above, move accounting to io_wq as well and
   completely drop struct io_wqe (Gabriel)

 - Enable KASAN for the internal io_uring caches (Breno)

 - Add support for multishot timeouts. Some applications use timeouts to
   wake someone waiting on completion entries, and this makes it a bit
   easier to just have a recurring timer rather than needing to rearm it
   every time (David)

 - Support archs that have shared cache coloring between userspace and
   the kernel, and hence have strict address requirements for mmap'ing
   the ring into userspace. This should only be parisc/hppa. (Helge, me)

 - XFS has supported O_DIRECT writes without needing to lock the inode
   exclusively for a long time, and ext4 now supports it as well. This
   is true for the common cases of not extending the file size. Flag the
   fs as having that feature, and utilize that to avoid serializing
   those writes in io_uring (me)

 - Enable completion batching for uring commands (me)

 - Revert patch adding io_uring restriction to what can be GUP mapped or
   not. This does not belong in io_uring, as io_uring isn't really
   special in this regard. Since this is also getting in the way of
   cleanups and improvements to the GUP code, get rid of if (me)

 - A few series greatly reducing the complexity of registered resources,
   like buffers or files. Not only does this clean up the code a lot,
   the simplified code is also a LOT more efficient (Pavel)

 - Series optimizing how we wait for events and run task_work related to
   it (Pavel)

 - Fixes for file/buffer unregistration with DEFER_TASKRUN (Pavel)

 - Misc cleanups and improvements (Pavel, me)

* tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux: (71 commits)
  Revert "io_uring/rsrc: disallow multi-source reg buffers"
  io_uring: add support for multishot timeouts
  io_uring/rsrc: disassociate nodes and rsrc_data
  io_uring/rsrc: devirtualise rsrc put callbacks
  io_uring/rsrc: pass node to io_rsrc_put_work()
  io_uring/rsrc: inline io_rsrc_put_work()
  io_uring/rsrc: add empty flag in rsrc_node
  io_uring/rsrc: merge nodes and io_rsrc_put
  io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal
  io_uring/rsrc: remove unused io_rsrc_node::llist
  io_uring/rsrc: refactor io_queue_rsrc_removal
  io_uring/rsrc: simplify single file node switching
  io_uring/rsrc: clean up __io_sqe_buffers_update()
  io_uring/rsrc: inline switch_start fast path
  io_uring/rsrc: remove rsrc_data refs
  io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce
  io_uring/rsrc: use wq for quiescing
  io_uring/rsrc: refactor io_rsrc_ref_quiesce
  io_uring/rsrc: remove io_rsrc_node::done
  io_uring/rsrc: use nospec'ed indexes
  ...
  • Loading branch information
Linus Torvalds committed Apr 26, 2023
2 parents 5c7ecad + 3c85cc4 commit 5b9a7bb
Show file tree
Hide file tree
Showing 22 changed files with 949 additions and 847 deletions.
3 changes: 2 additions & 1 deletion fs/ext4/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}

filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
FMODE_DIO_PARALLEL_WRITE;
return dquot_file_open(inode, filp);
}

Expand Down
3 changes: 2 additions & 1 deletion fs/xfs/xfs_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1171,7 +1171,8 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
FMODE_DIO_PARALLEL_WRITE;
return generic_file_open(inode, file);
}

Expand Down
3 changes: 3 additions & 0 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,

#define FMODE_NOREUSE ((__force fmode_t)0x800000)

/* File supports non-exclusive O_DIRECT writes from multiple threads */
#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)

Expand Down
24 changes: 15 additions & 9 deletions include/linux/io_uring_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,10 @@ struct io_ev_fd {
};

struct io_alloc_cache {
struct hlist_head list;
struct io_wq_work_node list;
unsigned int nr_cached;
unsigned int max_cached;
size_t elem_size;
};

struct io_ring_ctx {
Expand Down Expand Up @@ -239,7 +241,6 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
Expand Down Expand Up @@ -295,7 +296,7 @@ struct io_ring_ctx {
spinlock_t completion_lock;

bool poll_multi_queue;
bool cq_waiting;
atomic_t cq_wait_nr;

/*
* ->iopoll_list is protected by the ctx->uring_lock for
Expand Down Expand Up @@ -325,16 +326,15 @@ struct io_ring_ctx {
struct io_restriction restrictions;

/* slow path rsrc auxilary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;

struct delayed_work rsrc_put_work;
struct callback_head rsrc_put_tw;
struct llist_head rsrc_put_llist;
/* protected by ->uring_lock */
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
struct io_alloc_cache rsrc_node_cache;
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;

struct list_head io_buffers_pages;

Expand Down Expand Up @@ -366,6 +366,11 @@ struct io_ring_ctx {
unsigned evfd_last_cq_tail;
};

struct io_tw_state {
/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
bool locked;
};

enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
Expand Down Expand Up @@ -472,7 +477,7 @@ enum {
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

struct io_task_work {
struct llist_node node;
Expand Down Expand Up @@ -562,6 +567,7 @@ struct io_kiocb {
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union {
struct hlist_node hash_node;
Expand Down
15 changes: 6 additions & 9 deletions include/trace/events/io_uring.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,27 +360,25 @@ TRACE_EVENT(io_uring_complete,
);

/**
* io_uring_submit_sqe - called before submitting one SQE
* io_uring_submit_req - called before submitting a request
*
* @req: pointer to a submitted request
* @force_nonblock: whether a context blocking or not
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
TRACE_EVENT(io_uring_submit_req,

TP_PROTO(struct io_kiocb *req, bool force_nonblock),
TP_PROTO(struct io_kiocb *req),

TP_ARGS(req, force_nonblock),
TP_ARGS(req),

TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )

__string( op_str, io_uring_get_opcode(req->opcode) )
Expand All @@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;

__assign_str(op_str, io_uring_get_opcode(req->opcode));
),

TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
"sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
__entry->flags, __entry->sq_thread)
);

/*
Expand Down
33 changes: 19 additions & 14 deletions include/uapi/linux/io_uring.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
Expand Down Expand Up @@ -389,6 +390,9 @@ enum {
#define IORING_OFF_SQ_RING 0ULL
#define IORING_OFF_CQ_RING 0x8000000ULL
#define IORING_OFF_SQES 0x10000000ULL
#define IORING_OFF_PBUF_RING 0x80000000ULL
#define IORING_OFF_PBUF_SHIFT 16
#define IORING_OFF_MMAP_MASK 0xf8000000ULL

/*
* Filled with the offset for mmap(2)
Expand Down Expand Up @@ -568,19 +572,6 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};

struct io_uring_notification_slot {
__u64 tag;
__u64 resv[3];
};

struct io_uring_notification_register {
__u32 nr_slots;
__u32 resv;
__u64 resv2;
__u64 data;
__u64 resv3;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)

Expand Down Expand Up @@ -635,12 +626,26 @@ struct io_uring_buf_ring {
};
};

/*
* Flags for IORING_REGISTER_PBUF_RING.
*
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
* The application must not set a ring_addr in struct
* io_uring_buf_reg, instead it must subsequently call
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
enum {
IOU_PBUF_RING_MMAP = 1,
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
__u64 ring_addr;
__u32 ring_entries;
__u16 bgid;
__u16 pad;
__u16 flags;
__u64 resv[3];
};

Expand Down
39 changes: 26 additions & 13 deletions io_uring/alloc_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,47 +7,60 @@
#define IO_ALLOC_CACHE_MAX 512

struct io_cache_entry {
struct hlist_node node;
struct io_wq_work_node node;
};

static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry)
{
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++;
hlist_add_head(&entry->node, &cache->list);
wq_stack_add_head(&entry->node, &cache->list);
/* KASAN poisons object */
kasan_slab_free_mempool(entry);
return true;
}
return false;
}

static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
{
return !cache->list.next;
}

static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{
if (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
if (cache->list.next) {
struct io_cache_entry *entry;

hlist_del(node);
entry = container_of(cache->list.next, struct io_cache_entry, node);
kasan_unpoison_range(entry, cache->elem_size);
cache->list.next = cache->list.next->next;
cache->nr_cached--;
return container_of(node, struct io_cache_entry, node);
return entry;
}

return NULL;
}

static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
{
INIT_HLIST_HEAD(&cache->list);
cache->list.next = NULL;
cache->nr_cached = 0;
cache->max_cached = max_nr;
cache->elem_size = size;
}

static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
void (*free)(struct io_cache_entry *))
{
while (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
while (1) {
struct io_cache_entry *entry = io_alloc_cache_get(cache);

hlist_del(node);
free(container_of(node, struct io_cache_entry, node));
if (!entry)
break;
free(entry);
}
cache->nr_cached = 0;
}
Expand Down
21 changes: 4 additions & 17 deletions io_uring/filetable.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret;

Expand All @@ -81,18 +80,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
if (file_slot->file_ptr) {
struct file *old_file;

ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;

old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
ctx->rsrc_node, old_file);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file);
if (ret)
goto err;
return ret;

file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}

ret = io_scm_file_account(ctx, file);
Expand All @@ -101,9 +95,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, slot_index);
}
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
return ret;
}

Expand Down Expand Up @@ -156,23 +147,19 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
return -ENXIO;
if (offset >= ctx->nr_user_files)
return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;

offset = array_index_nospec(offset, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
if (!file_slot->file_ptr)
return -EBADF;

file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
ret = io_queue_rsrc_removal(ctx->file_data, offset, file);
if (ret)
return ret;

file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0;
}

Expand Down
Loading

0 comments on commit 5b9a7bb

Please sign in to comment.