From 35fa71a030caa50458a043560d4814ea9bcd639f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 22 Apr 2019 10:23:23 -0600
Subject: [PATCH 1/5] io_uring: fail io_uring_register(2) on a dying io_uring instance

If we have multiple threads doing io_uring_register(2) on an io_uring
fd, then we can potentially try and kill the percpu reference while
someone else has already killed it.

Prevent this race by failing io_uring_register(2) if the ref is marked
dying. This is safe since we're inside the io_uring mutex.

Fixes: b19062a56726 ("io_uring: fix possible deadlock between io_uring_{enter,register}")
Reported-by: syzbot
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f65f85d892174..a2f39faed6a7e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2934,6 +2934,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 {
 	int ret;
 
+	/*
+	 * We're inside the ring mutex, if the ref is already dying, then
+	 * someone else killed the ctx or is already going through
+	 * io_uring_register().
+	 */
+	if (percpu_ref_is_dying(&ctx->refs))
+		return -ENXIO;
+
 	percpu_ref_kill(&ctx->refs);
 
 	/*

From e523a29c4f2703bdb98f68ce1bb256e259fd8d5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20B=C3=BChler?=
Date: Fri, 19 Apr 2019 11:57:44 +0200
Subject: [PATCH 2/5] io_uring: fix race condition reading SQ entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A read memory barrier is required between reading SQ tail and reading
the actual data belonging to the SQ entry.

Userspace needs a matching write barrier between writing SQ entries and
updating SQ tail (using smp_store_release to update tail will do).

Signed-off-by: Stefan Bühler
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a2f39faed6a7e..41e3a6f6a096a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1739,7 +1739,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	head = ctx->cached_sq_head;
 	/* See comment at the top of this file */
 	smp_rmb();
-	if (head == READ_ONCE(ring->r.tail))
+	/* make sure SQ entry isn't read before tail */
+	if (head == smp_load_acquire(&ring->r.tail))
 		return false;
 
 	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
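The note about userspace in the commit message above is the other half of this pairing: the SQE data and the SQ array slot have to be written before the new tail becomes visible. Below is a minimal sketch of that submission side, assuming a hypothetical sq_push() helper and an illustrative struct sq_ring layout (neither is a kernel or liburing API), with the __atomic_store_n() builtin standing in for smp_store_release():

#include <linux/io_uring.h>

/*
 * Illustrative userspace view of the mapped SQ ring; this struct and its
 * field names are assumptions made for the sketch only.
 */
struct sq_ring {
	unsigned *khead;		/* kernel consumer head */
	unsigned *ktail;		/* our producer tail */
	unsigned *kring_mask;
	unsigned *karray;		/* indices into sqes[] */
	struct io_uring_sqe *sqes;
};

static void sq_push(struct sq_ring *sq, const struct io_uring_sqe *sqe)
{
	unsigned tail = *sq->ktail;		/* we are the only tail writer */
	unsigned idx = tail & *sq->kring_mask;

	sq->sqes[idx] = *sqe;			/* 1) fill in the SQE itself */
	sq->karray[idx] = idx;			/* 2) fill in the array slot */

	/*
	 * 3) publish the new tail with release semantics so the kernel
	 * cannot observe the tail increment before the writes above.
	 */
	__atomic_store_n(sq->ktail, tail + 1, __ATOMIC_RELEASE);
}

Once io_get_sqring() observes the new tail through the smp_load_acquire() added here, everything written before the release store is guaranteed to be visible to the kernel.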
From 0d7bae69c574c5f25802f8a71252e7d66933a3ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20B=C3=BChler?=
Date: Fri, 19 Apr 2019 11:57:45 +0200
Subject: [PATCH 3/5] io_uring: fix race condition when sq threads goes sleeping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reading the SQ tail needs to come after setting IORING_SQ_NEED_WAKEUP in
flags; there is no cheap barrier for ordering a store before a load, a
full memory barrier is required.

Userspace needs a full memory barrier between updating SQ tail and
checking for the IORING_SQ_NEED_WAKEUP too.

Signed-off-by: Stefan Bühler
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 41e3a6f6a096a..69910fd9ccca0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1865,7 +1865,8 @@ static int io_sq_thread(void *data)
 
 			/* Tell userspace we may need a wakeup call */
 			ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
-			smp_wmb();
+			/* make sure to read SQ tail after writing flags */
+			smp_mb();
 
 			if (!io_get_sqring(ctx, &sqes[0])) {
 				if (kthread_should_stop()) {
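The userspace half of this wakeup handshake is the mirror image: store the new tail first, then execute a full memory barrier, then load the flags word to decide whether the SQ thread needs a kick via io_uring_enter() with IORING_ENTER_SQ_WAKEUP set. A hedged sketch continuing the illustrative sq_ring example above; sq_needs_wakeup() is a made-up helper, not an existing API:

#include <stdbool.h>
#include <linux/io_uring.h>

/* 'kflags' points at the mapped SQ ring flags word */
static bool sq_needs_wakeup(const unsigned *kflags)
{
	/*
	 * Full barrier: the tail store made in sq_push() must not be
	 * reordered after this load of the flags word, otherwise we could
	 * read a stale "no wakeup needed" value while the SQ thread is
	 * already going to sleep without having seen the new tail.
	 */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);

	return (__atomic_load_n(kflags, __ATOMIC_RELAXED) &
		IORING_SQ_NEED_WAKEUP) != 0;
}

With the smp_mb() added in io_sq_thread() on one side and this fence on the other, either the SQ thread re-reads the tail after advertising IORING_SQ_NEED_WAKEUP and finds the new entries, or the submitter sees the flag and issues the wakeup; a submission cannot be missed by both.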
From fb775faa9e46ff481e4ced11116c9bd45359cb43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20B=C3=BChler?=
Date: Fri, 19 Apr 2019 11:57:46 +0200
Subject: [PATCH 4/5] io_uring: fix poll full SQ detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

io_uring_poll shouldn't signal EPOLLOUT | EPOLLWRNORM if the queue is
full; the old check would always signal EPOLLOUT | EPOLLWRNORM (unless
there were U32_MAX - 1 entries in the SQ queue).

Signed-off-by: Stefan Bühler
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 69910fd9ccca0..b998e98acd01e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2576,7 +2576,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &ctx->cq_wait, wait);
 	/* See comment at the top of this file */
 	smp_rmb();
-	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+	if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
+	    ctx->sq_ring->ring_entries)
 		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
 		mask |= EPOLLIN | EPOLLRDNORM;

From 8358e3a8264a228cf2dfb6f3a05c0328f4118f12 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 23 Apr 2019 08:17:58 -0600
Subject: [PATCH 5/5] io_uring: remove 'state' argument from io_{read,write} path

Since commit 09bb839434b we don't use the state argument for any sort
of on-stack caching in the io read and write path. Remove the stale
and unused argument from them, and bubble it up to __io_submit_sqe()
and down to io_prep_rw().

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b998e98acd01e..0e9fb2cb1984b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -740,7 +740,7 @@ static bool io_file_supports_async(struct file *file)
 }
 
 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
-		      bool force_nonblock, struct io_submit_state *state)
+		      bool force_nonblock)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -938,7 +938,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 }
 
 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		   bool force_nonblock, struct io_submit_state *state)
+		   bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -947,7 +947,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	int ret;
 
-	ret = io_prep_rw(req, s, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
@@ -985,7 +985,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 }
 
 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
-		    bool force_nonblock, struct io_submit_state *state)
+		    bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -994,7 +994,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	int ret;
 
-	ret = io_prep_rw(req, s, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
 		return ret;
 
@@ -1336,8 +1336,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			   const struct sqe_submit *s, bool force_nonblock,
-			   struct io_submit_state *state)
+			   const struct sqe_submit *s, bool force_nonblock)
 {
 	int ret, opcode;
 
@@ -1353,18 +1352,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_READV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_read(req, s, force_nonblock, state);
+		ret = io_read(req, s, force_nonblock);
 		break;
 	case IORING_OP_WRITEV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_write(req, s, force_nonblock, state);
+		ret = io_write(req, s, force_nonblock);
 		break;
 	case IORING_OP_READ_FIXED:
-		ret = io_read(req, s, force_nonblock, state);
+		ret = io_read(req, s, force_nonblock);
 		break;
 	case IORING_OP_WRITE_FIXED:
-		ret = io_write(req, s, force_nonblock, state);
+		ret = io_write(req, s, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
@@ -1457,7 +1456,7 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 		s->has_user = cur_mm != NULL;
 		s->needs_lock = true;
 		do {
-			ret = __io_submit_sqe(ctx, req, s, false, NULL);
+			ret = __io_submit_sqe(ctx, req, s, false);
 			/*
 			 * We can get EAGAIN for polled IO even though
 			 * we're forcing a sync submission from here,
@@ -1623,7 +1622,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(ret))
 		goto out;
 
-	ret = __io_submit_sqe(ctx, req, s, true, state);
+	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN) {
 		struct io_uring_sqe *sqe_copy;