From ad711c5d113f53d6f16096dd6ed9f4939a857149 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 09:54:56 -0600 Subject: [PATCH 01/47] io_uring/poll: always set 'ctx' in io_cancel_data This isn't strictly necessary for this callsite, as it uses its internal lookup for this cancelation purpose. But let's be consistent with how it's used in general and set ctx as well. Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index d4597efe14a7..c7bb292c9046 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -972,8 +972,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; From faa9c0ee3cab9c68b79183c9e0111ba967d9f402 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 09:34:08 -0600 Subject: [PATCH 02/47] io_uring/timeout: always set 'ctx' in io_cancel_data In preparation for using a generic handler to match requests for cancelation purposes, ensure that ctx is set in io_cancel_data. The timeout handlers don't check for this as it'll always match, but we'll need it set going forward. Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index fb0547b35dcd..4200099ad96e 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -409,7 +409,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_cancel_data cd = { .data = user_data, }; + struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; @@ -473,7 +473,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; + struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; spin_lock(&ctx->completion_lock); ret = io_timeout_cancel(ctx, &cd); From aa5cd116f3c25c05e4724d7b5e24dc9ed9020a12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 09:00:24 -0600 Subject: [PATCH 03/47] io_uring/cancel: abstract out request match helper We have different match code in a variety of spots. Start the cleanup of this by abstracting out a helper that can be used to check if a given request matches the cancelation criteria outlined in io_cancel_data.
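As a rough sketch of the intended use (the variable names below are illustrative, not part of this patch), a cancelation site fills in the match key once and then tests each candidate request with the new helper:

	struct io_cancel_data cd = {
		.ctx	= ctx,
		.data	= user_data_to_match,	/* illustrative */
		.flags	= cancel_flags,		/* IORING_ASYNC_CANCEL_* flags */
		.seq	= atomic_inc_return(&ctx->cancel_seq),
	};

	if (io_cancel_req_match(req, &cd))
		matched = true;		/* 'req' falls under this cancelation */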
Signed-off-by: Jens Axboe --- io_uring/cancel.c | 17 +++++++++++++---- io_uring/cancel.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 58c46c852bdd..8527ec3cc11f 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -27,11 +27,11 @@ struct io_cancel { #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) -static bool io_cancel_cb(struct io_wq_work *work, void *data) +/* + * Returns true if the request matches the criteria outlined by 'cd'. + */ +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - if (req->ctx != cd->ctx) return false; if (cd->flags & IORING_ASYNC_CANCEL_ANY) { @@ -48,9 +48,18 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return false; req->work.cancel_seq = cd->seq; } + return true; } +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; + + return io_cancel_req_match(req, cd); +} + static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 6a59ee484d0c..496ce4dac78e 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); From 3a372b66923e4af966af2900da588e3b3de6fcd2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 09:38:26 -0600 Subject: [PATCH 04/47] io_uring/cancel: fix sequence matching for IORING_ASYNC_CANCEL_ANY We always need to check/update the cancel sequence if IORING_ASYNC_CANCEL_ALL is set. Also kill the redundant check for IORING_ASYNC_CANCEL_ANY at the end, if we get here we know it's not set as we would've matched it higher up. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 8527ec3cc11f..bf44563d687d 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -35,7 +35,7 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) if (req->ctx != cd->ctx) return false; if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; + goto check_seq; } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; @@ -43,7 +43,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) if (req->cqe.user_data != cd->data) return false; } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { +check_seq: if (cd->seq == req->work.cancel_seq) return false; req->work.cancel_seq = cd->seq; From a30badf66de8516b5a5bca7a5d339f377ff983ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 09:04:35 -0600 Subject: [PATCH 05/47] io_uring: use cancelation match helper for poll and timeout requests Get rid of the request vs io_cancel_data checking and just use the exported helper for this. 
Signed-off-by: Jens Axboe --- io_uring/poll.c | 12 ++++-------- io_uring/timeout.c | 12 +++--------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index c7bb292c9046..dc1219f606e5 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -824,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - *out_bucket = hb; - return req; + if (io_cancel_req_match(req, cd)) { + *out_bucket = hb; + return req; + } } spin_unlock(&hb->lock); } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4200099ad96e..6242130e73c6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -268,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, list_for_each_entry(timeout, &ctx->timeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; + if (io_cancel_req_match(tmp, cd)) { + req = tmp; + break; } - req = tmp; - break; } if (!req) return ERR_PTR(-ENOENT); From 8165b566049b14152873011ea540eb22eae5111d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 10:33:11 -0600 Subject: [PATCH 06/47] io_uring/cancel: add IORING_ASYNC_CANCEL_USERDATA Add a flag to explicitly match on user_data in the request for cancelation purposes. This is the default behavior if none of the other match flags are set, but if we ALSO want to match on user_data, then this flag can be set. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/cancel.c | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 08720c7bd92f..97246ec386d4 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -299,11 +299,13 @@ enum io_uring_op { * request 'user_data' * IORING_ASYNC_CANCEL_ANY Match any request * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor + * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) +#define IORING_ASYNC_CANCEL_USERDATA (1U << 4) /* * send/sendmsg and recv/recvmsg flags (sqe->ioprio) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index bf44563d687d..20612e93a354 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -25,24 +25,30 @@ struct io_cancel { }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ + IORING_ASYNC_CANCEL_USERDATA) /* * Returns true if the request matches the criteria outlined by 'cd'. 
*/ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { + bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; + if (req->ctx != cd->ctx) return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD))) + match_user_data = true; + + if (cd->flags & IORING_ASYNC_CANCEL_ANY) goto check_seq; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; - } else { - if (req->cqe.user_data != cd->data) - return false; } + if (match_user_data && req->cqe.user_data != cd->data) + return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: if (cd->seq == req->work.cancel_seq) From d7b8b079a8f6bc007d06d9ee468659dae6053e13 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 10:36:43 -0600 Subject: [PATCH 07/47] io_uring/cancel: support opcode based lookup and cancelation Add IORING_ASYNC_CANCEL_OP flag for cancelation, which allows the application to target cancelation based on the opcode of the original request. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/cancel.c | 17 ++++++++++++++--- io_uring/cancel.h | 2 +- io_uring/poll.c | 3 ++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 97246ec386d4..b64ddd41468e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -300,12 +300,14 @@ enum io_uring_op { * IORING_ASYNC_CANCEL_ANY Match any request * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key + * IORING_ASYNC_CANCEL_OP Match request based on opcode */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) #define IORING_ASYNC_CANCEL_USERDATA (1U << 4) +#define IORING_ASYNC_CANCEL_OP (1U << 5) /* * send/sendmsg and recv/recvmsg flags (sqe->ioprio) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 20612e93a354..d91116b032eb 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -22,11 +22,12 @@ struct io_cancel { u64 addr; u32 flags; s32 fd; + u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ - IORING_ASYNC_CANCEL_USERDATA) + IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) /* * Returns true if the request matches the criteria outlined by 'cd'. 
@@ -38,7 +39,7 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) if (req->ctx != cd->ctx) return false; - if (!(cd->flags & (IORING_ASYNC_CANCEL_FD))) + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) match_user_data = true; if (cd->flags & IORING_ASYNC_CANCEL_ANY) @@ -47,6 +48,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) if (req->file != cd->file) return false; } + if (cd->flags & IORING_ASYNC_CANCEL_OP) { + if (req->opcode != cd->opcode) + return false; + } if (match_user_data && req->cqe.user_data != cd->data) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { @@ -127,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) + if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); @@ -139,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } + if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->opcode = READ_ONCE(sqe->len); + } return 0; } @@ -185,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, + .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 496ce4dac78e..fc98622e6166 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -8,11 +8,11 @@ struct io_cancel_data { u64 data; struct file *file; }; + u8 opcode; u32 flags; int seq; }; - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/poll.c b/io_uring/poll.c index dc1219f606e5..65ec363f6377 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -851,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, struct io_hash_bucket *bucket; struct io_kiocb *req; - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | + IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd, table, &bucket); else req = io_poll_find(ctx, false, cd, table, &bucket); From f77569d22ad91dc25de294864fa5b24d37ddc149 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jun 2023 13:03:52 -0600 Subject: [PATCH 08/47] io_uring/cancel: wire up IORING_ASYNC_CANCEL_OP for sync cancel Allow usage of IORING_ASYNC_CANCEL_OP through the sync cancelation API as well. 
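As a rough usage illustration (not part of this patch), userspace could then cancel all pending requests of one opcode through the sync interface along the lines below, assuming liburing's io_uring_register_sync_cancel() wrapper and an already initialized 'ring'; the opcode and timeout values are just examples:

	struct io_uring_sync_cancel_reg reg = { };
	int ret;

	reg.flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
	reg.opcode = IORING_OP_READ;	/* example: cancel every pending read */
	reg.timeout.tv_sec = -1;	/* assumption: -1/-1 means wait without a timeout */
	reg.timeout.tv_nsec = -1;

	ret = io_uring_register_sync_cancel(&ring, &reg);
	/* 0 on success, or a -errno style error such as -ENOENT if nothing matched */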
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 +++- io_uring/cancel.c | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b64ddd41468e..36f9c73082de 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -701,7 +701,9 @@ struct io_uring_sync_cancel_reg { __s32 fd; __u32 flags; struct __kernel_timespec timeout; - __u64 pad[4]; + __u8 opcode; + __u8 pad[7]; + __u64 pad2[3]; }; /* diff --git a/io_uring/cancel.c b/io_uring/cancel.c index d91116b032eb..7b23607cf4af 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -265,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_sync_cancel_reg sc; struct fd f = { }; DEFINE_WAIT(wait); - int ret; + int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; - if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) - return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad); i++) + if (sc.pad[i]) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) + if (sc.pad2[i]) + return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; + cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && From 8e9fad0e70b7b62848e0aeb1a873903b9ce4d7c4 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 27 Jun 2023 06:44:24 -0700 Subject: [PATCH 09/47] io_uring: Add io_uring command support for sockets Enable io_uring commands on network sockets. Create two new SOCKET_URING_OP commands that will operate on sockets. In order to call ioctl on sockets, use the file_operations->uring_cmd callbacks, and map it to a uring socket function, which handles the SOCKET_URING_OP accordingly, and calls socket ioctls. This patch was tested by creating a new test case in liburing.
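For illustration only (this snippet is not from the patch or the liburing test), issuing one of the new commands could look roughly like the following, where 'ring' is an already initialized io_uring and 'sockfd' is a connected socket; the sqe fields are filled in by hand via the cmd_op member used by IORING_OP_URING_CMD:

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	struct io_uring_cqe *cqe;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	/* on success, cqe->res holds the SIOCINQ result (bytes queued for reading) */
	io_uring_cqe_seen(&ring, cqe);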
Link: https://github.com/leitao/liburing/tree/io_uring_cmd Signed-off-by: Breno Leitao Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230627134424.2784797-1-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 6 ++++++ include/uapi/linux/io_uring.h | 8 ++++++++ io_uring/uring_cmd.c | 28 ++++++++++++++++++++++++++++ net/socket.c | 2 ++ 4 files changed, 44 insertions(+) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index bb9c666bd584..106cdc55ff3b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode) { return ""; } +static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 36f9c73082de..9fc7195f25df 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -723,6 +723,14 @@ struct io_uring_recvmsg_out { __u32 flags; }; +/* + * Argument for IORING_OP_URING_CMD when file is a socket + */ +enum { + SOCKET_URING_OP_SIOCINQ = 0, + SOCKET_URING_OP_SIOCOUTQ, +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 476c7877ce58..8e7a03c1b20e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -7,6 +7,7 @@ #include #include +#include #include "io_uring.h" #include "rsrc.h" @@ -164,3 +165,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return io_import_fixed(rw, iter, req->imu, ubuf, len); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->sqe->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/net/socket.c b/net/socket.c index 2b0e54b2405c..1dc23f5298ba 100644 --- a/net/socket.c +++ b/net/socket.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -159,6 +160,7 @@ static const struct file_operations socket_file_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif + .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, From b97f96e22f051d59d07a527dbd7d90408b661ca8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Aug 2023 14:38:01 -0600 Subject: [PATCH 10/47] io_uring: annotate the struct io_kiocb slab for appropriate user copy When compiling the kernel with clang and having HARDENED_USERCOPY enabled, the liburing openat2.t test case fails during request setup: usercopy: Kernel memory overwrite attempt detected to SLUB object 'io_kiocb' (offset 24, size 24)! ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:102! 
invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU: 3 PID: 413 Comm: openat2.t Tainted: G N 6.4.3-g6995e2de6891-dirty #19 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.org 04/01/2014 RIP: 0010:usercopy_abort+0x84/0x90 Code: ce 49 89 ce 48 c7 c3 68 48 98 82 48 0f 44 de 48 c7 c7 56 c6 94 82 4c 89 de 48 89 c1 41 52 41 56 53 e8 e0 51 c5 00 48 83 c4 18 <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 41 57 41 56 RSP: 0018:ffffc900016b3da0 EFLAGS: 00010296 RAX: 0000000000000062 RBX: ffffffff82984868 RCX: 4e9b661ac6275b00 RDX: ffff8881b90ec580 RSI: ffffffff82949a64 RDI: 00000000ffffffff RBP: 0000000000000018 R08: 0000000000000000 R09: 0000000000000000 R10: ffffc900016b3c88 R11: ffffc900016b3c30 R12: 00007ffe549659e0 R13: ffff888119014000 R14: 0000000000000018 R15: 0000000000000018 FS: 00007f862e3ca680(0000) GS:ffff8881b90c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005571483542a8 CR3: 0000000118c11000 CR4: 00000000003506e0 Call Trace: ? __die_body+0x63/0xb0 ? die+0x9d/0xc0 ? do_trap+0xa7/0x180 ? usercopy_abort+0x84/0x90 ? do_error_trap+0xc6/0x110 ? usercopy_abort+0x84/0x90 ? handle_invalid_op+0x2c/0x40 ? usercopy_abort+0x84/0x90 ? exc_invalid_op+0x2f/0x40 ? asm_exc_invalid_op+0x16/0x20 ? usercopy_abort+0x84/0x90 __check_heap_object+0xe2/0x110 __check_object_size+0x142/0x3d0 io_openat2_prep+0x68/0x140 io_submit_sqes+0x28a/0x680 __se_sys_io_uring_enter+0x120/0x580 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x55714834de26 Code: ca 01 0f b6 82 d0 00 00 00 8b ba cc 00 00 00 45 31 c0 31 d2 41 b9 08 00 00 00 83 e0 01 c1 e0 04 41 09 c2 b8 aa 01 00 00 0f 05 66 0f 1f 84 00 00 00 00 00 89 30 eb 89 0f 1f 40 00 8b 00 a8 06 RSP: 002b:00007ffe549659c8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa RAX: ffffffffffffffda RBX: 00007ffe54965a50 RCX: 000055714834de26 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000000 R11: 0000000000000246 R12: 000055714834f057 R13: 00007ffe54965a50 R14: 0000000000000001 R15: 0000557148351dd8 Modules linked in: ---[ end trace 0000000000000000 ]--- when it tries to copy struct open_how from userspace into the per-command space in the io_kiocb. There's nothing wrong with the copy, but we're missing the appropriate annotations for allowing user copies to/from the io_kiocb slab. Allow copies in the per-command area, which is from the 'file' pointer to when 'opcode' starts. We do have existing user copies there, but they are not all annotated like the one that openat2_prep() uses, copy_struct_from_user(). But in practice opcodes should be allowed to copy data into their per-command area in the io_kiocb. Reported-by: Breno Leitao Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7505de2428e0..679bea7c41a6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4601,8 +4601,20 @@ static int __init io_uring_init(void) io_uring_optable_init(); - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + /* + * Allow user copy in the per-command field, which starts after the + * file in io_kiocb and until the opcode field. 
The openat2 handling + * requires copying in user memory into the io_kiocb object in that + * range, and HARDENED_USERCOPY will complain if we haven't + * correctly annotated this range. + */ + req_cachep = kmem_cache_create_usercopy("io_kiocb", + sizeof(struct io_kiocb), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, + offsetof(struct io_kiocb, cmd.data), + sizeof_field(struct io_kiocb, cmd.data), NULL); + return 0; }; __initcall(io_uring_init); From d4b30eed51d79361c290dc25a1386f5611f4982a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 8 Aug 2023 23:10:58 +0800 Subject: [PATCH 11/47] io_uring/rsrc: Remove unused declaration io_rsrc_put_tw() Commit 36b9818a5a84 ("io_uring/rsrc: don't offload node free") removed the implementation but left the declaration. Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20230808151058.4572-1-yuehaibing@huawei.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0a8a95e9b99e..8afa9ec66a55 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -57,7 +57,6 @@ struct io_mapped_ubuf { struct bio_vec bvec[]; }; -void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); From cfdbaa3a291d6fd2cb4a1a70d74e63b4abc2f5ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 9 Aug 2023 13:21:41 +0100 Subject: [PATCH 12/47] io_uring: fix drain stalls by invalid SQE cq_extra is protected by ->completion_lock, which io_get_sqe() misses. The bug is harmless as it doesn't happen in real life, requires an invalid SQ index array and racing with submission, and only messes up the userspace, i.e. stalls request execution, but it will be cleaned up on ring destruction. Fixes: 15641e427070f ("io_uring: don't cache number of dropped SQEs") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66096d54651b1a60534bb2023f2947f09f50ef73.1691538547.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 679bea7c41a6..249cab9c86d1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2374,7 +2374,9 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) } /* drop invalid entries */ + spin_lock(&ctx->completion_lock); ctx->cq_extra--; + spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; From 569f5308e54352a12181cc0185f848024c5443e8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 9 Aug 2023 13:22:16 +0100 Subject: [PATCH 13/47] io_uring: fix false positive KASAN warnings io_req_local_work_add() peeks into the work list, which can be executed in the meantime. It's completely fine without KASAN as we're in an RCU read section and it's SLAB_TYPESAFE_BY_RCU. With KASAN though it may trigger a false positive warning because internal io_uring caches are sanitised. Remove sanitisation from the io_uring request cache for now.
Cc: stable@vger.kernel.org Fixes: 8751d15426a31 ("io_uring: reduce scheduling due to tw") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c6fbf7a82a341e66a0007c76eefd9d57f2d3ba51.1691541473.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - io_uring/io_uring.h | 1 - 2 files changed, 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 249cab9c86d1..d0888907527d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -229,7 +229,6 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); - kasan_poison_object_data(req_cachep, req); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d3606d30cf6f..12769bad5cee 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -354,7 +354,6 @@ static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); - kasan_unpoison_object_data(req_cachep, req); wq_stack_extract(&ctx->submit_state.free_list); return req; } From 17619322e56bce68290842889658ec5981f00a42 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 9 Aug 2023 13:22:52 +0100 Subject: [PATCH 14/47] io_uring: kill io_uring userspace examples There are tons of io_uring tests and examples in liburing and on the Internet. If you're looking for a benchmark, io_uring-bench.c is just an acutely outdated version of fio/io_uring. And for basic condensed init template for likes of selftests take a peek at io_uring_zerocopy_tx.c. Kill tools/io_uring/, it's a burden keeping it here. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c740701d3b475dcad8c92602a551044f72176b4.1691543666.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 - tools/io_uring/Makefile | 18 - tools/io_uring/README | 29 -- tools/io_uring/barrier.h | 16 - tools/io_uring/io_uring-bench.c | 592 -------------------------------- tools/io_uring/io_uring-cp.c | 283 --------------- tools/io_uring/liburing.h | 187 ---------- tools/io_uring/queue.c | 156 --------- tools/io_uring/setup.c | 107 ------ tools/io_uring/syscall.c | 52 --- 10 files changed, 1441 deletions(-) delete mode 100644 tools/io_uring/Makefile delete mode 100644 tools/io_uring/README delete mode 100644 tools/io_uring/barrier.h delete mode 100644 tools/io_uring/io_uring-bench.c delete mode 100644 tools/io_uring/io_uring-cp.c delete mode 100644 tools/io_uring/liburing.h delete mode 100644 tools/io_uring/queue.c delete mode 100644 tools/io_uring/setup.c delete mode 100644 tools/io_uring/syscall.c diff --git a/MAINTAINERS b/MAINTAINERS index aee340630eca..2ce167bd0ab4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10920,7 +10920,6 @@ F: include/linux/io_uring_types.h F: include/trace/events/io_uring.h F: include/uapi/linux/io_uring.h F: io_uring/ -F: tools/io_uring/ IPMI SUBSYSTEM M: Corey Minyard diff --git a/tools/io_uring/Makefile b/tools/io_uring/Makefile deleted file mode 100644 index 00f146c54c53..000000000000 --- a/tools/io_uring/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for io_uring test tools -CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE -LDLIBS += -lpthread - -all: io_uring-cp io_uring-bench -%: %.c - $(CC) $(CFLAGS) -o $@ $^ - -io_uring-bench: syscall.o io_uring-bench.o - $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) - -io_uring-cp: setup.o syscall.o queue.o - -clean: - $(RM) io_uring-cp io_uring-bench *.o - -.PHONY: all clean diff --git a/tools/io_uring/README b/tools/io_uring/README deleted file mode 100644 index 67fd70115cff..000000000000 --- a/tools/io_uring/README +++ /dev/null @@ -1,29 +0,0 @@ -This directory includes a few programs that demonstrate how to use io_uring -in an application. The examples are: - -io_uring-cp - A very basic io_uring implementation of cp(1). It takes two - arguments, copies the first argument to the second. This example - is part of liburing, and hence uses the simplified liburing API - for setting up an io_uring instance, submitting IO, completing IO, - etc. The support functions in queue.c and setup.c are straight - out of liburing. - -io_uring-bench - Benchmark program that does random reads on a number of files. This - app demonstrates the various features of io_uring, like fixed files, - fixed buffers, and polled IO. There are options in the program to - control which features to use. Arguments is the file (or files) that - io_uring-bench should operate on. This uses the raw io_uring - interface. - -liburing can be cloned with git here: - - git://git.kernel.dk/liburing - -and contains a number of unit tests as well for testing io_uring. It also -comes with man pages for the three system calls. 
- -Fio includes an io_uring engine, you can clone fio here: - - git://git.kernel.dk/fio diff --git a/tools/io_uring/barrier.h b/tools/io_uring/barrier.h deleted file mode 100644 index ef00f6722ba9..000000000000 --- a/tools/io_uring/barrier.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef LIBURING_BARRIER_H -#define LIBURING_BARRIER_H - -#if defined(__x86_64) || defined(__i386__) -#define read_barrier() __asm__ __volatile__("":::"memory") -#define write_barrier() __asm__ __volatile__("":::"memory") -#else -/* - * Add arch appropriate definitions. Be safe and use full barriers for - * archs we don't have support for. - */ -#define read_barrier() __sync_synchronize() -#define write_barrier() __sync_synchronize() -#endif - -#endif diff --git a/tools/io_uring/io_uring-bench.c b/tools/io_uring/io_uring-bench.c deleted file mode 100644 index 7703f0118385..000000000000 --- a/tools/io_uring/io_uring-bench.c +++ /dev/null @@ -1,592 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Simple benchmark program that uses the various features of io_uring - * to provide fast random access to a device/file. It has various - * options that are control how we use io_uring, see the OPTIONS section - * below. This uses the raw io_uring interface. - * - * Copyright (C) 2018-2019 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "liburing.h" -#include "barrier.h" - -#define min(a, b) ((a < b) ? (a) : (b)) - -struct io_sq_ring { - unsigned *head; - unsigned *tail; - unsigned *ring_mask; - unsigned *ring_entries; - unsigned *flags; - unsigned *array; -}; - -struct io_cq_ring { - unsigned *head; - unsigned *tail; - unsigned *ring_mask; - unsigned *ring_entries; - struct io_uring_cqe *cqes; -}; - -#define DEPTH 128 - -#define BATCH_SUBMIT 32 -#define BATCH_COMPLETE 32 - -#define BS 4096 - -#define MAX_FDS 16 - -static unsigned sq_ring_mask, cq_ring_mask; - -struct file { - unsigned long max_blocks; - unsigned pending_ios; - int real_fd; - int fixed_fd; -}; - -struct submitter { - pthread_t thread; - int ring_fd; - struct drand48_data rand; - struct io_sq_ring sq_ring; - struct io_uring_sqe *sqes; - struct iovec iovecs[DEPTH]; - struct io_cq_ring cq_ring; - int inflight; - unsigned long reaps; - unsigned long done; - unsigned long calls; - volatile int finish; - - __s32 *fds; - - struct file files[MAX_FDS]; - unsigned nr_files; - unsigned cur_file; -}; - -static struct submitter submitters[1]; -static volatile int finish; - -/* - * OPTIONS: Set these to test the various features of io_uring. 
- */ -static int polled = 1; /* use IO polling */ -static int fixedbufs = 1; /* use fixed user buffers */ -static int register_files = 1; /* use fixed files */ -static int buffered = 0; /* use buffered IO, not O_DIRECT */ -static int sq_thread_poll = 0; /* use kernel submission/poller thread */ -static int sq_thread_cpu = -1; /* pin above thread to this CPU */ -static int do_nop = 0; /* no-op SQ ring commands */ - -static int io_uring_register_buffers(struct submitter *s) -{ - if (do_nop) - return 0; - - return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs, - DEPTH); -} - -static int io_uring_register_files(struct submitter *s) -{ - unsigned i; - - if (do_nop) - return 0; - - s->fds = calloc(s->nr_files, sizeof(__s32)); - for (i = 0; i < s->nr_files; i++) { - s->fds[i] = s->files[i].real_fd; - s->files[i].fixed_fd = i; - } - - return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds, - s->nr_files); -} - -static int lk_gettid(void) -{ - return syscall(__NR_gettid); -} - -static unsigned file_depth(struct submitter *s) -{ - return (DEPTH + s->nr_files - 1) / s->nr_files; -} - -static void init_io(struct submitter *s, unsigned index) -{ - struct io_uring_sqe *sqe = &s->sqes[index]; - unsigned long offset; - struct file *f; - long r; - - if (do_nop) { - sqe->opcode = IORING_OP_NOP; - return; - } - - if (s->nr_files == 1) { - f = &s->files[0]; - } else { - f = &s->files[s->cur_file]; - if (f->pending_ios >= file_depth(s)) { - s->cur_file++; - if (s->cur_file == s->nr_files) - s->cur_file = 0; - f = &s->files[s->cur_file]; - } - } - f->pending_ios++; - - lrand48_r(&s->rand, &r); - offset = (r % (f->max_blocks - 1)) * BS; - - if (register_files) { - sqe->flags = IOSQE_FIXED_FILE; - sqe->fd = f->fixed_fd; - } else { - sqe->flags = 0; - sqe->fd = f->real_fd; - } - if (fixedbufs) { - sqe->opcode = IORING_OP_READ_FIXED; - sqe->addr = (unsigned long) s->iovecs[index].iov_base; - sqe->len = BS; - sqe->buf_index = index; - } else { - sqe->opcode = IORING_OP_READV; - sqe->addr = (unsigned long) &s->iovecs[index]; - sqe->len = 1; - sqe->buf_index = 0; - } - sqe->ioprio = 0; - sqe->off = offset; - sqe->user_data = (unsigned long) f; -} - -static int prep_more_ios(struct submitter *s, unsigned max_ios) -{ - struct io_sq_ring *ring = &s->sq_ring; - unsigned index, tail, next_tail, prepped = 0; - - next_tail = tail = *ring->tail; - do { - next_tail++; - read_barrier(); - if (next_tail == *ring->head) - break; - - index = tail & sq_ring_mask; - init_io(s, index); - ring->array[index] = index; - prepped++; - tail = next_tail; - } while (prepped < max_ios); - - if (*ring->tail != tail) { - /* order tail store with writes to sqes above */ - write_barrier(); - *ring->tail = tail; - write_barrier(); - } - return prepped; -} - -static int get_file_size(struct file *f) -{ - struct stat st; - - if (fstat(f->real_fd, &st) < 0) - return -1; - if (S_ISBLK(st.st_mode)) { - unsigned long long bytes; - - if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) - return -1; - - f->max_blocks = bytes / BS; - return 0; - } else if (S_ISREG(st.st_mode)) { - f->max_blocks = st.st_size / BS; - return 0; - } - - return -1; -} - -static int reap_events(struct submitter *s) -{ - struct io_cq_ring *ring = &s->cq_ring; - struct io_uring_cqe *cqe; - unsigned head, reaped = 0; - - head = *ring->head; - do { - struct file *f; - - read_barrier(); - if (head == *ring->tail) - break; - cqe = &ring->cqes[head & cq_ring_mask]; - if (!do_nop) { - f = (struct file *) (uintptr_t) cqe->user_data; - f->pending_ios--; - if 
(cqe->res != BS) { - printf("io: unexpected ret=%d\n", cqe->res); - if (polled && cqe->res == -EOPNOTSUPP) - printf("Your filesystem doesn't support poll\n"); - return -1; - } - } - reaped++; - head++; - } while (1); - - s->inflight -= reaped; - *ring->head = head; - write_barrier(); - return reaped; -} - -static void *submitter_fn(void *data) -{ - struct submitter *s = data; - struct io_sq_ring *ring = &s->sq_ring; - int ret, prepped; - - printf("submitter=%d\n", lk_gettid()); - - srand48_r(pthread_self(), &s->rand); - - prepped = 0; - do { - int to_wait, to_submit, this_reap, to_prep; - - if (!prepped && s->inflight < DEPTH) { - to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT); - prepped = prep_more_ios(s, to_prep); - } - s->inflight += prepped; -submit_more: - to_submit = prepped; -submit: - if (to_submit && (s->inflight + to_submit <= DEPTH)) - to_wait = 0; - else - to_wait = min(s->inflight + to_submit, BATCH_COMPLETE); - - /* - * Only need to call io_uring_enter if we're not using SQ thread - * poll, or if IORING_SQ_NEED_WAKEUP is set. - */ - if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { - unsigned flags = 0; - - if (to_wait) - flags = IORING_ENTER_GETEVENTS; - if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) - flags |= IORING_ENTER_SQ_WAKEUP; - ret = io_uring_enter(s->ring_fd, to_submit, to_wait, - flags, NULL); - s->calls++; - } - - /* - * For non SQ thread poll, we already got the events we needed - * through the io_uring_enter() above. For SQ thread poll, we - * need to loop here until we find enough events. - */ - this_reap = 0; - do { - int r; - r = reap_events(s); - if (r == -1) { - s->finish = 1; - break; - } else if (r > 0) - this_reap += r; - } while (sq_thread_poll && this_reap < to_wait); - s->reaps += this_reap; - - if (ret >= 0) { - if (!ret) { - to_submit = 0; - if (s->inflight) - goto submit; - continue; - } else if (ret < to_submit) { - int diff = to_submit - ret; - - s->done += ret; - prepped -= diff; - goto submit_more; - } - s->done += ret; - prepped = 0; - continue; - } else if (ret < 0) { - if (errno == EAGAIN) { - if (s->finish) - break; - if (this_reap) - goto submit; - to_submit = 0; - goto submit; - } - printf("io_submit: %s\n", strerror(errno)); - break; - } - } while (!s->finish); - - finish = 1; - return NULL; -} - -static void sig_int(int sig) -{ - printf("Exiting on signal %d\n", sig); - submitters[0].finish = 1; - finish = 1; -} - -static void arm_sig_int(void) -{ - struct sigaction act; - - memset(&act, 0, sizeof(act)); - act.sa_handler = sig_int; - act.sa_flags = SA_RESTART; - sigaction(SIGINT, &act, NULL); -} - -static int setup_ring(struct submitter *s) -{ - struct io_sq_ring *sring = &s->sq_ring; - struct io_cq_ring *cring = &s->cq_ring; - struct io_uring_params p; - int ret, fd; - void *ptr; - - memset(&p, 0, sizeof(p)); - - if (polled && !do_nop) - p.flags |= IORING_SETUP_IOPOLL; - if (sq_thread_poll) { - p.flags |= IORING_SETUP_SQPOLL; - if (sq_thread_cpu != -1) { - p.flags |= IORING_SETUP_SQ_AFF; - p.sq_thread_cpu = sq_thread_cpu; - } - } - - fd = io_uring_setup(DEPTH, &p); - if (fd < 0) { - perror("io_uring_setup"); - return 1; - } - s->ring_fd = fd; - - if (fixedbufs) { - ret = io_uring_register_buffers(s); - if (ret < 0) { - perror("io_uring_register_buffers"); - return 1; - } - } - - if (register_files) { - ret = io_uring_register_files(s); - if (ret < 0) { - perror("io_uring_register_files"); - return 1; - } - } - - ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), - PROT_READ | PROT_WRITE, MAP_SHARED | 
MAP_POPULATE, fd, - IORING_OFF_SQ_RING); - printf("sq_ring ptr = 0x%p\n", ptr); - sring->head = ptr + p.sq_off.head; - sring->tail = ptr + p.sq_off.tail; - sring->ring_mask = ptr + p.sq_off.ring_mask; - sring->ring_entries = ptr + p.sq_off.ring_entries; - sring->flags = ptr + p.sq_off.flags; - sring->array = ptr + p.sq_off.array; - sq_ring_mask = *sring->ring_mask; - - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQES); - printf("sqes ptr = 0x%p\n", s->sqes); - - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_CQ_RING); - printf("cq_ring ptr = 0x%p\n", ptr); - cring->head = ptr + p.cq_off.head; - cring->tail = ptr + p.cq_off.tail; - cring->ring_mask = ptr + p.cq_off.ring_mask; - cring->ring_entries = ptr + p.cq_off.ring_entries; - cring->cqes = ptr + p.cq_off.cqes; - cq_ring_mask = *cring->ring_mask; - return 0; -} - -static void file_depths(char *buf) -{ - struct submitter *s = &submitters[0]; - unsigned i; - char *p; - - buf[0] = '\0'; - p = buf; - for (i = 0; i < s->nr_files; i++) { - struct file *f = &s->files[i]; - - if (i + 1 == s->nr_files) - p += sprintf(p, "%d", f->pending_ios); - else - p += sprintf(p, "%d, ", f->pending_ios); - } -} - -int main(int argc, char *argv[]) -{ - struct submitter *s = &submitters[0]; - unsigned long done, calls, reap; - int err, i, flags, fd; - char *fdepths; - void *ret; - - if (!do_nop && argc < 2) { - printf("%s: filename\n", argv[0]); - return 1; - } - - flags = O_RDONLY | O_NOATIME; - if (!buffered) - flags |= O_DIRECT; - - i = 1; - while (!do_nop && i < argc) { - struct file *f; - - if (s->nr_files == MAX_FDS) { - printf("Max number of files (%d) reached\n", MAX_FDS); - break; - } - fd = open(argv[i], flags); - if (fd < 0) { - perror("open"); - return 1; - } - - f = &s->files[s->nr_files]; - f->real_fd = fd; - if (get_file_size(f)) { - printf("failed getting size of device/file\n"); - return 1; - } - if (f->max_blocks <= 1) { - printf("Zero file/device size?\n"); - return 1; - } - f->max_blocks--; - - printf("Added file %s\n", argv[i]); - s->nr_files++; - i++; - } - - if (fixedbufs) { - struct rlimit rlim; - - rlim.rlim_cur = RLIM_INFINITY; - rlim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - perror("setrlimit"); - return 1; - } - } - - arm_sig_int(); - - for (i = 0; i < DEPTH; i++) { - void *buf; - - if (posix_memalign(&buf, BS, BS)) { - printf("failed alloc\n"); - return 1; - } - s->iovecs[i].iov_base = buf; - s->iovecs[i].iov_len = BS; - } - - err = setup_ring(s); - if (err) { - printf("ring setup failed: %s, %d\n", strerror(errno), err); - return 1; - } - printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); - printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); - - pthread_create(&s->thread, NULL, submitter_fn, s); - - fdepths = malloc(8 * s->nr_files); - reap = calls = done = 0; - do { - unsigned long this_done = 0; - unsigned long this_reap = 0; - unsigned long this_call = 0; - unsigned long rpc = 0, ipc = 0; - - sleep(1); - this_done += s->done; - this_call += s->calls; - this_reap += s->reaps; - if (this_call - calls) { - rpc = (this_done - done) / (this_call - calls); - ipc = (this_reap - reap) / (this_call - calls); - } else - rpc = ipc = -1; - file_depths(fdepths); - printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", - this_done - done, rpc, ipc, 
s->inflight, - fdepths); - done = this_done; - calls = this_call; - reap = this_reap; - } while (!finish); - - pthread_join(s->thread, &ret); - close(s->ring_fd); - free(fdepths); - return 0; -} diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c deleted file mode 100644 index d9bd6f5f8f46..000000000000 --- a/tools/io_uring/io_uring-cp.c +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Simple test program that demonstrates a file copy through io_uring. This - * uses the API exposed by liburing. - * - * Copyright (C) 2018-2019 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "liburing.h" - -#define QD 64 -#define BS (32*1024) - -static int infd, outfd; - -struct io_data { - int read; - off_t first_offset, offset; - size_t first_len; - struct iovec iov; -}; - -static int setup_context(unsigned entries, struct io_uring *ring) -{ - int ret; - - ret = io_uring_queue_init(entries, ring, 0); - if (ret < 0) { - fprintf(stderr, "queue_init: %s\n", strerror(-ret)); - return -1; - } - - return 0; -} - -static int get_file_size(int fd, off_t *size) -{ - struct stat st; - - if (fstat(fd, &st) < 0) - return -1; - if (S_ISREG(st.st_mode)) { - *size = st.st_size; - return 0; - } else if (S_ISBLK(st.st_mode)) { - unsigned long long bytes; - - if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) - return -1; - - *size = bytes; - return 0; - } - - return -1; -} - -static void queue_prepped(struct io_uring *ring, struct io_data *data) -{ - struct io_uring_sqe *sqe; - - sqe = io_uring_get_sqe(ring); - assert(sqe); - - if (data->read) - io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset); - else - io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset); - - io_uring_sqe_set_data(sqe, data); -} - -static int queue_read(struct io_uring *ring, off_t size, off_t offset) -{ - struct io_uring_sqe *sqe; - struct io_data *data; - - data = malloc(size + sizeof(*data)); - if (!data) - return 1; - - sqe = io_uring_get_sqe(ring); - if (!sqe) { - free(data); - return 1; - } - - data->read = 1; - data->offset = data->first_offset = offset; - - data->iov.iov_base = data + 1; - data->iov.iov_len = size; - data->first_len = size; - - io_uring_prep_readv(sqe, infd, &data->iov, 1, offset); - io_uring_sqe_set_data(sqe, data); - return 0; -} - -static void queue_write(struct io_uring *ring, struct io_data *data) -{ - data->read = 0; - data->offset = data->first_offset; - - data->iov.iov_base = data + 1; - data->iov.iov_len = data->first_len; - - queue_prepped(ring, data); - io_uring_submit(ring); -} - -static int copy_file(struct io_uring *ring, off_t insize) -{ - unsigned long reads, writes; - struct io_uring_cqe *cqe; - off_t write_left, offset; - int ret; - - write_left = insize; - writes = reads = offset = 0; - - while (insize || write_left) { - int had_reads, got_comp; - - /* - * Queue up as many reads as we can - */ - had_reads = reads; - while (insize) { - off_t this_size = insize; - - if (reads + writes >= QD) - break; - if (this_size > BS) - this_size = BS; - else if (!this_size) - break; - - if (queue_read(ring, this_size, offset)) - break; - - insize -= this_size; - offset += this_size; - reads++; - } - - if (had_reads != reads) { - ret = io_uring_submit(ring); - if (ret < 0) { - fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); - break; - } - } - - /* - * Queue is full at this point. Find at least one completion. 
- */ - got_comp = 0; - while (write_left) { - struct io_data *data; - - if (!got_comp) { - ret = io_uring_wait_cqe(ring, &cqe); - got_comp = 1; - } else { - ret = io_uring_peek_cqe(ring, &cqe); - if (ret == -EAGAIN) { - cqe = NULL; - ret = 0; - } - } - if (ret < 0) { - fprintf(stderr, "io_uring_peek_cqe: %s\n", - strerror(-ret)); - return 1; - } - if (!cqe) - break; - - data = io_uring_cqe_get_data(cqe); - if (cqe->res < 0) { - if (cqe->res == -EAGAIN) { - queue_prepped(ring, data); - io_uring_cqe_seen(ring, cqe); - continue; - } - fprintf(stderr, "cqe failed: %s\n", - strerror(-cqe->res)); - return 1; - } else if (cqe->res != data->iov.iov_len) { - /* Short read/write, adjust and requeue */ - data->iov.iov_base += cqe->res; - data->iov.iov_len -= cqe->res; - data->offset += cqe->res; - queue_prepped(ring, data); - io_uring_cqe_seen(ring, cqe); - continue; - } - - /* - * All done. if write, nothing else to do. if read, - * queue up corresponding write. - */ - if (data->read) { - queue_write(ring, data); - write_left -= data->first_len; - reads--; - writes++; - } else { - free(data); - writes--; - } - io_uring_cqe_seen(ring, cqe); - } - } - - /* wait out pending writes */ - while (writes) { - struct io_data *data; - - ret = io_uring_wait_cqe(ring, &cqe); - if (ret) { - fprintf(stderr, "wait_cqe=%d\n", ret); - return 1; - } - if (cqe->res < 0) { - fprintf(stderr, "write res=%d\n", cqe->res); - return 1; - } - data = io_uring_cqe_get_data(cqe); - free(data); - writes--; - io_uring_cqe_seen(ring, cqe); - } - - return 0; -} - -int main(int argc, char *argv[]) -{ - struct io_uring ring; - off_t insize; - int ret; - - if (argc < 3) { - printf("%s: infile outfile\n", argv[0]); - return 1; - } - - infd = open(argv[1], O_RDONLY); - if (infd < 0) { - perror("open infile"); - return 1; - } - outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (outfd < 0) { - perror("open outfile"); - return 1; - } - - if (setup_context(QD, &ring)) - return 1; - if (get_file_size(infd, &insize)) - return 1; - - ret = copy_file(&ring, insize); - - close(infd); - close(outfd); - io_uring_queue_exit(&ring); - return ret; -} diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h deleted file mode 100644 index 28a837b6069d..000000000000 --- a/tools/io_uring/liburing.h +++ /dev/null @@ -1,187 +0,0 @@ -#ifndef LIB_URING_H -#define LIB_URING_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include "../../include/uapi/linux/io_uring.h" -#include -#include -#include "barrier.h" - -/* - * Library interface to io_uring - */ -struct io_uring_sq { - unsigned *khead; - unsigned *ktail; - unsigned *kring_mask; - unsigned *kring_entries; - unsigned *kflags; - unsigned *kdropped; - unsigned *array; - struct io_uring_sqe *sqes; - - unsigned sqe_head; - unsigned sqe_tail; - - size_t ring_sz; -}; - -struct io_uring_cq { - unsigned *khead; - unsigned *ktail; - unsigned *kring_mask; - unsigned *kring_entries; - unsigned *koverflow; - struct io_uring_cqe *cqes; - - size_t ring_sz; -}; - -struct io_uring { - struct io_uring_sq sq; - struct io_uring_cq cq; - int ring_fd; -}; - -/* - * System calls - */ -extern int io_uring_setup(unsigned entries, struct io_uring_params *p); -extern int io_uring_enter(int fd, unsigned to_submit, - unsigned min_complete, unsigned flags, sigset_t *sig); -extern int io_uring_register(int fd, unsigned int opcode, void *arg, - unsigned int nr_args); - -/* - * Library interface - */ -extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, - 
unsigned flags); -extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, - struct io_uring *ring); -extern void io_uring_queue_exit(struct io_uring *ring); -extern int io_uring_peek_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr); -extern int io_uring_wait_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr); -extern int io_uring_submit(struct io_uring *ring); -extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); - -/* - * Must be called after io_uring_{peek,wait}_cqe() after the cqe has - * been processed by the application. - */ -static inline void io_uring_cqe_seen(struct io_uring *ring, - struct io_uring_cqe *cqe) -{ - if (cqe) { - struct io_uring_cq *cq = &ring->cq; - - (*cq->khead)++; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } -} - -/* - * Command prep helpers - */ -static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) -{ - sqe->user_data = (unsigned long) data; -} - -static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) -{ - return (void *) (uintptr_t) cqe->user_data; -} - -static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - const void *addr, unsigned len, - off_t offset) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = op; - sqe->fd = fd; - sqe->off = offset; - sqe->addr = (unsigned long) addr; - sqe->len = len; -} - -static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, - const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) -{ - io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); -} - -static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, - off_t offset) -{ - io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); -} - -static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, - const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) -{ - io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); -} - -static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, - const void *buf, unsigned nbytes, - off_t offset) -{ - io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); -} - -static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - unsigned poll_mask) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_POLL_ADD; - sqe->fd = fd; -#if __BYTE_ORDER == __BIG_ENDIAN - poll_mask = __swahw32(poll_mask); -#endif - sqe->poll_events = poll_mask; -} - -static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, - void *user_data) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_POLL_REMOVE; - sqe->addr = (unsigned long) user_data; -} - -static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, - unsigned fsync_flags) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_FSYNC; - sqe->fd = fd; - sqe->fsync_flags = fsync_flags; -} - -static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_NOP; -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/tools/io_uring/queue.c b/tools/io_uring/queue.c deleted file mode 100644 index 321819c132c7..000000000000 --- a/tools/io_uring/queue.c +++ /dev/null @@ -1,156 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "liburing.h" -#include "barrier.h" - -static int __io_uring_get_cqe(struct io_uring *ring, - 
struct io_uring_cqe **cqe_ptr, int wait) -{ - struct io_uring_cq *cq = &ring->cq; - const unsigned mask = *cq->kring_mask; - unsigned head; - int ret; - - *cqe_ptr = NULL; - head = *cq->khead; - do { - /* - * It's necessary to use a read_barrier() before reading - * the CQ tail, since the kernel updates it locklessly. The - * kernel has the matching store barrier for the update. The - * kernel also ensures that previous stores to CQEs are ordered - * with the tail update. - */ - read_barrier(); - if (head != *cq->ktail) { - *cqe_ptr = &cq->cqes[head & mask]; - break; - } - if (!wait) - break; - ret = io_uring_enter(ring->ring_fd, 0, 1, - IORING_ENTER_GETEVENTS, NULL); - if (ret < 0) - return -errno; - } while (1); - - return 0; -} - -/* - * Return an IO completion, if one is readily available. Returns 0 with - * cqe_ptr filled in on success, -errno on failure. - */ -int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) -{ - return __io_uring_get_cqe(ring, cqe_ptr, 0); -} - -/* - * Return an IO completion, waiting for it if necessary. Returns 0 with - * cqe_ptr filled in on success, -errno on failure. - */ -int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) -{ - return __io_uring_get_cqe(ring, cqe_ptr, 1); -} - -/* - * Submit sqes acquired from io_uring_get_sqe() to the kernel. - * - * Returns number of sqes submitted - */ -int io_uring_submit(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - const unsigned mask = *sq->kring_mask; - unsigned ktail, ktail_next, submitted, to_submit; - int ret; - - /* - * If we have pending IO in the kring, submit it first. We need a - * read barrier here to match the kernels store barrier when updating - * the SQ head. - */ - read_barrier(); - if (*sq->khead != *sq->ktail) { - submitted = *sq->kring_entries; - goto submit; - } - - if (sq->sqe_head == sq->sqe_tail) - return 0; - - /* - * Fill in sqes that we have queued up, adding them to the kernel ring - */ - submitted = 0; - ktail = ktail_next = *sq->ktail; - to_submit = sq->sqe_tail - sq->sqe_head; - while (to_submit--) { - ktail_next++; - read_barrier(); - - sq->array[ktail & mask] = sq->sqe_head & mask; - ktail = ktail_next; - - sq->sqe_head++; - submitted++; - } - - if (!submitted) - return 0; - - if (*sq->ktail != ktail) { - /* - * First write barrier ensures that the SQE stores are updated - * with the tail update. This is needed so that the kernel - * will never see a tail update without the preceeding sQE - * stores being done. - */ - write_barrier(); - *sq->ktail = ktail; - /* - * The kernel has the matching read barrier for reading the - * SQ tail. - */ - write_barrier(); - } - -submit: - ret = io_uring_enter(ring->ring_fd, submitted, 0, - IORING_ENTER_GETEVENTS, NULL); - if (ret < 0) - return -errno; - - return ret; -} - -/* - * Return an sqe to fill. Application must later call io_uring_submit() - * when it's ready to tell the kernel about it. The caller may call this - * function multiple times before calling io_uring_submit(). - * - * Returns a vacant sqe, or NULL if we're full. 
- */ -struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - unsigned next = sq->sqe_tail + 1; - struct io_uring_sqe *sqe; - - /* - * All sqes are used - */ - if (next - sq->sqe_head > *sq->kring_entries) - return NULL; - - sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask]; - sq->sqe_tail = next; - return sqe; -} diff --git a/tools/io_uring/setup.c b/tools/io_uring/setup.c deleted file mode 100644 index 0b50fcd78520..000000000000 --- a/tools/io_uring/setup.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "liburing.h" - -static int io_uring_mmap(int fd, struct io_uring_params *p, - struct io_uring_sq *sq, struct io_uring_cq *cq) -{ - size_t size; - void *ptr; - int ret; - - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); - ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); - if (ptr == MAP_FAILED) - return -errno; - sq->khead = ptr + p->sq_off.head; - sq->ktail = ptr + p->sq_off.tail; - sq->kring_mask = ptr + p->sq_off.ring_mask; - sq->kring_entries = ptr + p->sq_off.ring_entries; - sq->kflags = ptr + p->sq_off.flags; - sq->kdropped = ptr + p->sq_off.dropped; - sq->array = ptr + p->sq_off.array; - - size = p->sq_entries * sizeof(struct io_uring_sqe); - sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQES); - if (sq->sqes == MAP_FAILED) { - ret = -errno; -err: - munmap(sq->khead, sq->ring_sz); - return ret; - } - - cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); - ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); - if (ptr == MAP_FAILED) { - ret = -errno; - munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); - goto err; - } - cq->khead = ptr + p->cq_off.head; - cq->ktail = ptr + p->cq_off.tail; - cq->kring_mask = ptr + p->cq_off.ring_mask; - cq->kring_entries = ptr + p->cq_off.ring_entries; - cq->koverflow = ptr + p->cq_off.overflow; - cq->cqes = ptr + p->cq_off.cqes; - return 0; -} - -/* - * For users that want to specify sq_thread_cpu or sq_thread_idle, this - * interface is a convenient helper for mmap()ing the rings. - * Returns -1 on error, or zero on success. On success, 'ring' - * contains the necessary information to read/write to the rings. - */ -int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) -{ - int ret; - - memset(ring, 0, sizeof(*ring)); - ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); - if (!ret) - ring->ring_fd = fd; - return ret; -} - -/* - * Returns -1 on error, or zero on success. On success, 'ring' - * contains the necessary information to read/write to the rings. 
- */ -int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) -{ - struct io_uring_params p; - int fd, ret; - - memset(&p, 0, sizeof(p)); - p.flags = flags; - - fd = io_uring_setup(entries, &p); - if (fd < 0) - return fd; - - ret = io_uring_queue_mmap(fd, &p, ring); - if (ret) - close(fd); - - return ret; -} - -void io_uring_queue_exit(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - struct io_uring_cq *cq = &ring->cq; - - munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe)); - munmap(sq->khead, sq->ring_sz); - munmap(cq->khead, cq->ring_sz); - close(ring->ring_fd); -} diff --git a/tools/io_uring/syscall.c b/tools/io_uring/syscall.c deleted file mode 100644 index b22e0aa54e9d..000000000000 --- a/tools/io_uring/syscall.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Will go away once libc support is there - */ -#include -#include -#include -#include -#include "liburing.h" - -#ifdef __alpha__ -/* - * alpha is the only exception, all other architectures - * have common numbers for new system calls. - */ -# ifndef __NR_io_uring_setup -# define __NR_io_uring_setup 535 -# endif -# ifndef __NR_io_uring_enter -# define __NR_io_uring_enter 536 -# endif -# ifndef __NR_io_uring_register -# define __NR_io_uring_register 537 -# endif -#else /* !__alpha__ */ -# ifndef __NR_io_uring_setup -# define __NR_io_uring_setup 425 -# endif -# ifndef __NR_io_uring_enter -# define __NR_io_uring_enter 426 -# endif -# ifndef __NR_io_uring_register -# define __NR_io_uring_register 427 -# endif -#endif - -int io_uring_register(int fd, unsigned int opcode, void *arg, - unsigned int nr_args) -{ - return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); -} - -int io_uring_setup(unsigned int entries, struct io_uring_params *p) -{ - return syscall(__NR_io_uring_setup, entries, p); -} - -int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, - unsigned int flags, sigset_t *sig) -{ - return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, - flags, sig, _NSIG / 8); -} From dc314886cb3d0e4ab2858003e8de2917f8a3ccbd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 9 Aug 2023 16:20:21 +0100 Subject: [PATCH 15/47] io_uring: break iopolling on signal Don't keep spinning iopoll with a signal set. It'll eventually return back, e.g. by virtue of need_resched(), but it's not a nice user experience. Cc: stable@vger.kernel.org Fixes: def596e9557c9 ("io_uring: support for IO polling") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/eeba551e82cad12af30c3220125eb6cb244cc94c.1691594339.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0888907527d..ad4ffd3a876f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1673,6 +1673,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; nr_events += ret; ret = 0; + + if (task_sigpending(current)) + return -EINTR; } while (nr_events < min && !need_resched()); return ret; From 9e4bef2ba9e0c5fd0e0ae383f0c0fb0e338aafad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Aug 2023 10:03:00 -0600 Subject: [PATCH 16/47] io_uring: cleanup 'ret' handling in io_iopoll_check() We return 0 for success, or -error when there's an error. Move the 'ret' variable into the loop where we are actually using it, to make it clearer that we don't carry this variable forward for return outside of the loop. 
While at it, also move the need_resched() break condition out of the while check itself, keeping it with the signal pending check. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ad4ffd3a876f..dadd745d389e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1615,7 +1615,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; - int ret = 0; unsigned long check_cq; if (!io_allowed_run_tw(ctx)) @@ -1641,6 +1640,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; do { + int ret = 0; + /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -1669,16 +1670,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; } ret = io_do_iopoll(ctx, !min); - if (ret < 0) - break; - nr_events += ret; - ret = 0; + if (unlikely(ret < 0)) + return ret; if (task_sigpending(current)) return -EINTR; - } while (nr_events < min && !need_resched()); + if (need_resched()) + break; - return ret; + nr_events += ret; + } while (nr_events < min); + + return 0; } void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) From 3aaf22b62a9270b90524cd257755b960461a7614 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2023 13:13:54 -0600 Subject: [PATCH 17/47] io_uring/fdinfo: get rid of ref tryget The caller holds a reference to the ring itself, so by definition the ring cannot go away. There's no need to play games with tryget for the reference, as we don't need an extra reference at all. Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 76c279b13aee..300455b4bc12 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -46,9 +46,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { + struct io_ring_ctx *ctx = f->private_data; struct io_sq_data *sq = NULL; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; @@ -203,14 +207,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } - -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} #endif From 9f69a259576ad46e6a13812b2c272bc9a89f8e03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Jul 2023 11:11:58 -0600 Subject: [PATCH 18/47] io_uring/splice: use fput() directly No point in using io_file_put() here, as we need to check if it's a fixed file in the caller anyway. 
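As background for the fixed-file check mentioned above: whether the splice input is a fixed file is decided by userspace at submission time, by passing SPLICE_F_FD_IN_FIXED and giving fd_in as an index into the ring's registered file table, and that is precisely the case where the kernel must not drop a regular file reference. A minimal liburing-style sketch of both variants, for illustration only (queue_splice() is a made-up helper; it assumes liburing's io_uring_prep_splice() and a table set up with io_uring_register_files()):

#include <errno.h>
#include <stdbool.h>
#include <liburing.h>

/*
 * Queue a splice from 'fd_in' into the write end of a pipe. When
 * 'fd_in_registered' is true, 'fd_in' is an index into the table
 * registered with io_uring_register_files(), and SPLICE_F_FD_IN_FIXED
 * tells the kernel to resolve it from that table rather than taking
 * and dropping a normal file reference.
 */
static int queue_splice(struct io_uring *ring, int fd_in, int pipe_wr,
                        unsigned int nbytes, bool fd_in_registered)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        unsigned int flags = 0;

        if (!sqe)
                return -EBUSY;
        if (fd_in_registered)
                flags |= SPLICE_F_FD_IN_FIXED;
        /* -1 offsets mean "use the current file offsets" */
        io_uring_prep_splice(sqe, fd_in, -1, pipe_wr, -1, nbytes, flags);
        return io_uring_submit(ring);
}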
Signed-off-by: Jens Axboe --- io_uring/splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/splice.c b/io_uring/splice.c index 2a4bbb719531..7c4469e9540e 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); @@ -112,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); From 17bc28374cd06b7d2d3f1e88470ef89f9cd3a497 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Jul 2023 11:14:40 -0600 Subject: [PATCH 19/47] io_uring: have io_file_put() take an io_kiocb rather than the file No functional changes in this patch, just a prep patch for needing the request in io_file_put(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++---- io_uring/io_uring.h | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dadd745d389e..15697d88930d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -998,8 +998,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_put_kbuf_comp(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); rsrc_node = req->rsrc_node; /* @@ -1533,8 +1532,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); io_req_put_rsrc_locked(req, ctx); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 12769bad5cee..ff153af28236 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,10 +196,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void io_put_file(struct file *file) +static inline void io_put_file(struct io_kiocb *req) { - if (file) - fput(file); + if (!(req->flags & REQ_F_FIXED_FILE) && req->file) + fput(req->file); } static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, From 89226307b109f828566f0e024ee97b722167927c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2023 15:01:58 -0600 Subject: [PATCH 20/47] io_uring: remove unnecessary forward declaration We never use io_move_task_work_from_local() before it's defined in the file anyway, so kill the forward declaration. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 15697d88930d..047576bc98d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,7 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); -static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct kmem_cache *req_cachep; From 78848b9b05623cfddb790d23b0dc38a275eb0763 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Aug 2023 12:58:59 -0600 Subject: [PATCH 21/47] io_uring/io-wq: don't grab wq->lock for worker activation The worker free list is RCU protected, and checks for workers going away when iterating it. There's no need to hold the wq->lock around the lookup. Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 399e9a15c38d..3e7025b9e0dd 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -909,13 +909,10 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wq->lock); rcu_read_lock(); do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wq->lock); - if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; From de36a15f9a3842be24ca220060b77925f2f5435b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Aug 2023 12:59:40 -0600 Subject: [PATCH 22/47] io_uring/io-wq: reduce frequency of acct->lock acquisitions When we check if we have work to run, we grab the acct lock, check, drop it, and then return the result. If we do have work to run, then running the work will again grab acct->lock and get the work item. This causes us to grab acct->lock more frequently than we need to. If we have work to do, have io_acct_run_queue() return with the acct lock still acquired. io_worker_handle_work() is then always invoked with the acct lock already held. In a simple test cases that stats files (IORING_OP_STATX always hits io-wq), we see a nice reduction in locking overhead with this change: 19.32% -12.55% [kernel.kallsyms] [k] __cmpwait_case_32 20.90% -12.07% [kernel.kallsyms] [k] queued_spin_lock_slowpath Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 3e7025b9e0dd..18a049fc53ef 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -232,17 +232,25 @@ static void io_worker_exit(struct io_worker *worker) do_exit(0); } -static inline bool io_acct_run_queue(struct io_wq_acct *acct) +static inline bool __io_acct_run_queue(struct io_wq_acct *acct) { - bool ret = false; + return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) && + !wq_list_empty(&acct->work_list); +} +/* + * If there's work to do, returns true with acct->lock acquired. If not, + * returns false with no lock held. 
+ */ +static inline bool io_acct_run_queue(struct io_wq_acct *acct) + __acquires(&acct->lock) +{ raw_spin_lock(&acct->lock); - if (!wq_list_empty(&acct->work_list) && - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - ret = true; - raw_spin_unlock(&acct->lock); + if (__io_acct_run_queue(acct)) + return true; - return ret; + raw_spin_unlock(&acct->lock); + return false; } /* @@ -397,6 +405,7 @@ static void io_wq_dec_running(struct io_worker *worker) if (!io_acct_run_queue(acct)) return; + raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); @@ -521,9 +530,13 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_worker_handle_work(struct io_worker *worker) +/* + * Called with acct->lock held, drops it before returning + */ +static void io_worker_handle_work(struct io_wq_acct *acct, + struct io_worker *worker) + __releases(&acct->lock) { - struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); @@ -537,7 +550,6 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ - raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { @@ -591,6 +603,10 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); + + if (!__io_acct_run_queue(acct)) + break; + raw_spin_lock(&acct->lock); } while (1); } @@ -611,8 +627,13 @@ static int io_wq_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); + + /* + * If we have work to do, io_acct_run_queue() returns with + * the acct->lock held. If not, it will drop it. + */ while (io_acct_run_queue(acct)) - io_worker_handle_work(worker); + io_worker_handle_work(acct, worker); raw_spin_lock(&wq->lock); /* @@ -645,8 +666,8 @@ static int io_wq_worker(void *data) } } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_worker_handle_work(worker); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct)) + io_worker_handle_work(acct, worker); io_worker_exit(worker); return 0; From 22f7fb80e6d91980785d235dc939695d3a271c3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Aug 2023 13:07:54 -0600 Subject: [PATCH 23/47] io_uring/io-wq: don't gate worker wake up success on wake_up_process() All we really care about is finding a free worker. If said worker is already running, it's either starting new work already or it's just finishing up existing work. For the latter, we'll be finding this work item next anyway, and for the former, if the worker does go to sleep, it'll create a new worker anyway as we have pending items. This reduces try_to_wake_up() overhead considerably: 23.16% -10.46% [kernel.kallsyms] [k] try_to_wake_up Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 18a049fc53ef..2da0b1ba6a56 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -276,11 +276,14 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, io_worker_release(worker); continue; } - if (wake_up_process(worker->task)) { - io_worker_release(worker); - return true; - } + /* + * If the worker is already running, it's either already + * starting work or finishing work. 
In either case, if it does + * to go sleep, we'll kick off a new task for this work anyway. + */ + wake_up_process(worker->task); io_worker_release(worker); + return true; } return false; From 1bfed23349716a7811645336a7ce42c4b8f250bc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:41 +0100 Subject: [PATCH 24/47] io_uring/net: don't overflow multishot accept Don't allow overflowing multishot accept CQEs, we want to limit the grows of the overflow list. Cc: stable@vger.kernel.org Fixes: 4e86a2c980137 ("io_uring: implement multishot mode for accept") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7d0d749649244873772623dd7747966f516fe6e2.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index eb1f51ddcb23..1599493544a5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1367,7 +1367,7 @@ retry: if (ret < 0) return ret; if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, - IORING_CQE_F_MORE, true)) + IORING_CQE_F_MORE, false)) goto retry; return -ECANCELED; From b2e74db55dd93d6db22a813c9a775b5dbf87c560 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:42 +0100 Subject: [PATCH 25/47] io_uring/net: don't overflow multishot recv Don't allow overflowing multishot recv CQEs, it might get out of hand, hurt performance, and in the worst case scenario OOM the task. Cc: stable@vger.kernel.org Fixes: b3fdea6ecb55c ("io_uring: multishot recv") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0b295634e8f1b71aa764c984608c22d85f88f75c.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 1599493544a5..8c419c01a5db 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -642,7 +642,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (!mshot_finished) { if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE, true)) { + *ret, cflags | IORING_CQE_F_MORE, false)) { io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || From 00b0db562485fbb259cd4054346208ad0885d662 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:43 +0100 Subject: [PATCH 26/47] io_uring: open code io_fill_cqe_req() io_fill_cqe_req() is only called from one place, open code it, and rename __io_fill_cqe_req(). 
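To connect the two multishot patches above to what an application sees: a multishot accept or recv stays armed only while its completions carry IORING_CQE_F_MORE, and with overflow now disallowed the request simply terminates once the CQ fills up, leaving userspace to drain the ring and re-issue it. A minimal accept loop in that style, for illustration only (accept_loop() is a made-up name, it assumes liburing's io_uring_prep_multishot_accept(), and error/SQ-full handling is trimmed):

#include <stdio.h>
#include <liburing.h>

static void accept_loop(struct io_uring *ring, int listen_fd)
{
        struct io_uring_cqe *cqe;
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
        io_uring_submit(ring);

        while (!io_uring_wait_cqe(ring, &cqe)) {
                if (cqe->res >= 0)
                        printf("accepted fd %d\n", cqe->res);

                /*
                 * No IORING_CQE_F_MORE means the multishot request has
                 * terminated (for example because the CQ was full, or an
                 * error occurred) and will post no further completions
                 * until it is re-armed.
                 */
                if (!(cqe->flags & IORING_CQE_F_MORE)) {
                        sqe = io_uring_get_sqe(ring);
                        io_uring_prep_multishot_accept(sqe, listen_fd,
                                                       NULL, NULL, 0);
                        io_uring_submit(ring);
                }
                io_uring_cqe_seen(ring, cqe);
        }
}

A real consumer would inspect cqe->res and back off or tear down on persistent errors rather than blindly re-arming.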
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f432ce75bb1c94cadf0bd2add4d6aa510bd1fb36.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++--- io_uring/io_uring.h | 11 +---------- io_uring/rw.c | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 047576bc98d0..e969b4ca1c47 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -978,8 +978,10 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - io_fill_cqe_req(ctx, req); + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!io_fill_cqe_req(ctx, req)) + io_req_cqe_overflow(req); + } /* * If we're the last reference to this request, add to our locked @@ -1556,7 +1558,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP) && - unlikely(!__io_fill_cqe_req(ctx, req))) { + unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->task_complete) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ff153af28236..3aa208fbe905 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -133,8 +133,7 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return io_get_cqe_overflow(ctx, false); } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_uring_cqe *cqe; @@ -168,14 +167,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, return true; } -static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (likely(__io_fill_cqe_req(ctx, req))) - return true; - return io_req_cqe_overflow(req); -} - static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; diff --git a/io_uring/rw.c b/io_uring/rw.c index 1bce2208b65c..9b51afdae505 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1064,7 +1064,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) continue; req->cqe.flags = io_put_kbuf(req, 0); - if (unlikely(!__io_fill_cqe_req(ctx, req))) { + if (unlikely(!io_fill_cqe_req(ctx, req))) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); spin_unlock(&ctx->completion_lock); From 056695bffa4beed5668dd4aa11efb696eacb3ed9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:44 +0100 Subject: [PATCH 27/47] io_uring: remove return from io_req_cqe_overflow() Nobody checks io_req_cqe_overflow()'s return, make it return void. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8f2029ad0c22f73451664172d834372608ee0a77.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++---- io_uring/io_uring.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e969b4ca1c47..7595658a5073 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -813,15 +813,15 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -bool io_req_cqe_overflow(struct io_kiocb *req) +void io_req_cqe_overflow(struct io_kiocb *req) { if (!(req->flags & REQ_F_CQE32_INIT)) { req->extra1 = 0; req->extra2 = 0; } - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->extra1, req->extra2); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->extra1, req->extra2); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3aa208fbe905..3dc0b6fb0ef7 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -39,7 +39,7 @@ enum { }; struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); -bool io_req_cqe_overflow(struct io_kiocb *req); +void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); From b6b2bb58a75407660f638a68e6e34a07036146d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:45 +0100 Subject: [PATCH 28/47] io_uring: never overflow io_aux_cqe Now all callers of io_aux_cqe() set allow_overflow to false, remove the parameter and not allow overflowing auxilary multishot cqes. When CQ is full the function callers and all multishot requests in general are expected to complete the request. That prevents indefinite in-background grows of the overflow list and let's the userspace to handle the backlog at its own pace. Resubmitting a request should also be faster than accounting a bunch of overflows, so it should be better for perf when it happens, but a well behaving userspace should be trying to avoid overflows in any case. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bb20d14d708ea174721e58bb53786b0521e4dd6d.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++---- io_uring/io_uring.h | 3 +-- io_uring/net.c | 8 ++++---- io_uring/poll.c | 4 ++-- io_uring/timeout.c | 4 ++-- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7595658a5073..e57d00939ab9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -939,15 +939,18 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, - bool allow_overflow) +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. 
+ */ +bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + return __io_post_aux_cqe(ctx, user_data, res, cflags, false); lockdep_assert_held(&ctx->uring_lock); @@ -962,7 +965,7 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, * however it's main job is to prevent unbounded posted completions, * and in that it works just as well. */ - if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return false; cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3dc0b6fb0ef7..3e6ff3cd9a24 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -44,8 +44,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, - bool allow_overflow); +bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/net.c b/io_uring/net.c index 8c419c01a5db..3d07bf79c1e0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -641,8 +641,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE, false)) { + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE)) { io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || @@ -1366,8 +1366,8 @@ retry: if (ret < 0) return ret; - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, - IORING_CQE_F_MORE, false)) + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + ret, IORING_CQE_F_MORE)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index 65ec363f6377..4c360ba8793a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req, ts->locked, mask, - IORING_CQE_F_MORE, false)) { + if (!io_fill_cqe_req_aux(req, ts->locked, mask, + IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 6242130e73c6..7fd7dbb211d6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { bool filled; - filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, - false); + filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, + IORING_CQE_F_MORE); if (filled) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); From 19a63c4021702e389a559726b16fcbf07a8a05f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:46 +0100 Subject: 
[PATCH 29/47] io_uring/rsrc: keep one global dummy_ubuf We set empty registered buffers to dummy_ubuf as an optimisation. Currently, we allocate the dummy entry for each ring, whenever we can simply have one global instance. We're casting out const on assignment, it's fine as we're not going to change the content of the dummy, the constness gives us an extra layer of protection if sth ever goes wrong. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e4a96dda35ab755914bc43f6781bba0df97ac489.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 --------- io_uring/rsrc.c | 14 ++++++++++---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e57d00939ab9..a7a4d637aee0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,13 +290,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) goto err; - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; @@ -335,7 +328,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: - kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); @@ -2897,7 +2889,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5e8fdd9b8ca6..d9c853d10587 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -33,6 +33,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +static const struct io_mapped_ubuf dummy_ubuf = { + /* set invalid range, so io_import_fixed() fails meeting it */ + .ubuf = -1UL, + .ubuf_end = 0, +}; + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -132,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; - if (imu != ctx->dummy_ubuf) { + if (imu != &dummy_ubuf) { for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) @@ -459,14 +465,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, break; i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = ctx->dummy_ubuf; + ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; @@ -1077,7 +1083,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, int ret, nr_pages, i; struct folio *folio = NULL; - *pimu = ctx->dummy_ubuf; + *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; From d246c759c47eafe4688613e89b337e48c39c5968 Mon Sep 17 
00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Aug 2023 13:53:47 +0100 Subject: [PATCH 30/47] io_uring: simplify io_run_task_work_sig return Nobody cares about io_run_task_work_sig returning 1, we only check for negative errors. Simplify by keeping to 0/-error returns. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3aec8a532c003d6e50739b969a82989402696170.1691757663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a7a4d637aee0..e189158ebbdd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2477,10 +2477,10 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) if (!llist_empty(&ctx->work_llist)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx) > 0) - return 1; + return 0; } if (io_run_task_work() > 0) - return 1; + return 0; if (task_sigpending(current)) return -EINTR; return 0; From ebdfefc09c6de7897962769bd3e63a2ff443ebf5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 13 Aug 2023 11:05:36 -0600 Subject: [PATCH 31/47] io_uring/sqpoll: fix io-wq affinity when IORING_SETUP_SQPOLL is used If we setup the ring with SQPOLL, then that polling thread has its own io-wq setup. This means that if the application uses IORING_REGISTER_IOWQ_AFF to set the io-wq affinity, we should not be setting it for the invoking task, but rather the sqpoll task. Add an sqpoll helper that parks the thread and updates the affinity, and use that one if we're using SQPOLL. Fixes: fe76421d1da1 ("io_uring: allow user configurable IO thread CPU affinity") Cc: stable@vger.kernel.org # 5.10+ Link: https://github.com/axboe/liburing/discussions/884 Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 9 ++++++--- io_uring/io-wq.h | 2 +- io_uring/io_uring.c | 29 ++++++++++++++++++----------- io_uring/sqpoll.c | 15 +++++++++++++++ io_uring/sqpoll.h | 1 + 5 files changed, 41 insertions(+), 15 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 2da0b1ba6a56..62f345587df5 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1306,13 +1306,16 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) return __io_wq_cpu_online(wq, cpu, false); } -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + if (!tctx || !tctx->io_wq) + return -EINVAL; + rcu_read_lock(); if (mask) - cpumask_copy(wq->cpu_mask, mask); + cpumask_copy(tctx->io_wq->cpu_mask, mask); else - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); return 0; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 31228426d192..06d9ca90c577 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -50,7 +50,7 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e189158ebbdd..e1a23f4993d3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4183,16 +4183,28 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } +static __cold int 
__io_register_iowq_aff(struct io_ring_ctx *ctx, + cpumask_var_t new_mask) +{ + int ret; + + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_wq_cpu_affinity(current->io_uring, new_mask); + } else { + mutex_unlock(&ctx->uring_lock); + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); + mutex_lock(&ctx->uring_lock); + } + + return ret; +} + static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { - struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; int ret; - if (!tctx || !tctx->io_wq) - return -EINVAL; - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; @@ -4213,19 +4225,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, return -EFAULT; } - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - return io_wq_cpu_affinity(tctx->io_wq, NULL); + return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 5e329e3cd470..ee2d2c687fda 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -421,3 +421,18 @@ err: io_sq_thread_finish(ctx); return ret; } + +__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, + cpumask_var_t mask) +{ + struct io_sq_data *sqd = ctx->sq_data; + int ret = -EINVAL; + + if (sqd) { + io_sq_thread_park(sqd); + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + io_sq_thread_unpark(sqd); + } + + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index e1b8d508d22d..8df37e8c9149 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -27,3 +27,4 @@ void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); From 04d9244c9420db33149608a566399176d57690f8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:21:47 -0700 Subject: [PATCH 32/47] io_uring/rsrc: Annotate struct io_mapped_ubuf with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct io_mapped_ubuf. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Jens Axboe Cc: Pavel Begunkov Cc: io-uring@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: "Gustavo A. R. 
Silva" Link: https://lore.kernel.org/r/20230817212146.never.853-kees@kernel.org Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8afa9ec66a55..8625181fb87a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -54,7 +54,7 @@ struct io_mapped_ubuf { u64 ubuf_end; unsigned int nr_bvecs; unsigned long acct_pages; - struct bio_vec bvec[]; + struct bio_vec bvec[] __counted_by(nr_bvecs); }; void io_rsrc_node_ref_zero(struct io_rsrc_node *node); From a0727c738309a06ef5579c1742f8f0def63aa883 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:23 +0100 Subject: [PATCH 33/47] io_uring: improve cqe !tracing hot path While looking at io_fill_cqe_req()'s asm I stumbled on our trace points turning into the chunk below: trace_io_uring_complete(req->ctx, req, req->cqe.user_data, req->cqe.res, req->cqe.flags, req->extra1, req->extra2); io_uring/io_uring.c:898: trace_io_uring_complete(req->ctx, req, req->cqe.user_data, movq 232(%rbx), %rdi # req_44(D)->big_cqe.extra2, _5 movq 224(%rbx), %rdx # req_44(D)->big_cqe.extra1, _6 movl 84(%rbx), %r9d # req_44(D)->cqe.D.81184.flags, _7 movl 80(%rbx), %r8d # req_44(D)->cqe.res, _8 movq 72(%rbx), %rcx # req_44(D)->cqe.user_data, _9 movq 88(%rbx), %rsi # req_44(D)->ctx, _10 ./arch/x86/include/asm/jump_label.h:27: asm_volatile_goto("1:" 1:jmp .L1772 # objtool NOPs this # ... It does a jump_label for actual tracing, but those 6 moves will stay there in the hottest io_uring path. As an optimisation, add a trace_io_uring_complete_enabled() check, which is also uses jump_labels, it tricks the compiler into behaving. It removes the junk without changing anything else int the hot path. Note: apparently, it's not only me noticing it, and people are also working it around. We should remove the check when it's solved generically or rework tracing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/555d8312644b3776f4be7e23f9b92943875c4bc7.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3e6ff3cd9a24..465598223386 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -145,10 +145,11 @@ static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req if (unlikely(!cqe)) return false; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); memcpy(cqe, &req->cqe, sizeof(*cqe)); From 31d3ba924fd86add6d14f9085fdd2f4ec0879631 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:24 +0100 Subject: [PATCH 34/47] io_uring: cqe init hardening io_kiocb::cqe stores the completion info which we'll memcpy to userspace, and we rely on callbacks and other later steps to populate it with right values. We have never had problems with that, but it would still be safer to zero it on allocation. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b16a3b64dde678686460d3c3792c3ba6d3d1bc7a.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e1a23f4993d3..3e0fe1ebbc10 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1056,7 +1056,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->cqe.res = 0; + memset(&req->cqe, 0, sizeof(req->cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, From b24c5d752962fa0970cd7e3d74b1cd0e843358de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:25 +0100 Subject: [PATCH 35/47] io_uring: simplify big_cqe handling Don't keep big_cqe bits of req in a union with hash_node, find a separate space for it. It's bit safer, but also if we keep it always initialised, we can get rid of ugly REQ_F_CQE32_INIT handling. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/447aa1b2968978c99e655ba88db536e903df0fe9.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 16 ++++++---------- io_uring/io_uring.c | 8 +++----- io_uring/io_uring.h | 15 +++------------ io_uring/uring_cmd.c | 5 ++--- 4 files changed, 14 insertions(+), 30 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f04ce513fadb..9795eda529f7 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -409,7 +409,6 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, REQ_F_HASH_LOCKED_BIT, @@ -479,8 +478,6 @@ enum { REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ @@ -579,13 +576,7 @@ struct io_kiocb { struct io_task_work io_task_work; unsigned nr_tw; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; + struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ @@ -595,6 +586,11 @@ struct io_kiocb { /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; + + struct { + u64 extra1; + u64 extra2; + } big_cqe; }; struct io_overflow_cqe { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3e0fe1ebbc10..0aeb33256a6d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -807,13 +807,10 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, void io_req_cqe_overflow(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE32_INIT)) { - req->extra1 = 0; - req->extra2 = 0; - } io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, - req->extra1, req->extra2); + req->big_cqe.extra1, req->big_cqe.extra2); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } /* @@ -1057,6 +1054,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->async_data = NULL; /* not necessary, but safer to zero */ memset(&req->cqe, 0, sizeof(req->cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 465598223386..9b5dfb6ef484 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -148,21 +148,12 @@ static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req if (trace_io_uring_complete_enabled()) trace_io_uring_complete(req->ctx, req, req->cqe.user_data, req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); + memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } return true; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8e7a03c1b20e..537795fddc87 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -43,9 +43,8 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; } /* From 20d6b633870495fda1d92d283ebf890d80f68ecd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:26 +0100 Subject: [PATCH 36/47] io_uring: refactor __io_get_cqe() Make __io_get_cqe simpler by not grabbing the cqe from refilled cached, but letting io_get_cqe() do it for us. That's cleaner and removes some duplication. 
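The shape of this refactor, combined with the bool-plus-out-parameter change in the following patch, is easier to see in a self-contained toy. This is not the io_uring code, all names are made up, and the "ring" here is just a flat array:

#include <stdio.h>
#include <stdbool.h>

struct entry { int value; };

struct cache {
        struct entry entries[16];
        unsigned int filled;            /* valid entries in 'entries' */
        unsigned int exposed;           /* entries made visible so far */
        struct entry *cached;           /* next entry to hand out */
        struct entry *sentinel;         /* end of the exposed range */
};

/* Slow path: only recompute the cached range, hand nothing out here. */
static bool cache_refill(struct cache *c)
{
        unsigned int left = c->filled - c->exposed;
        unsigned int chunk = left < 4 ? left : 4;

        if (!chunk)
                return false;
        c->cached = &c->entries[c->exposed];
        c->sentinel = c->cached + chunk;
        c->exposed += chunk;
        return true;
}

/*
 * Fast path: always pop from the cached range. Returning bool with an
 * out-parameter, rather than a possibly-NULL pointer, lets callers (and
 * the compiler) drop a redundant NULL check after the branch.
 */
static bool get_entry(struct cache *c, struct entry **ret)
{
        if (c->cached >= c->sentinel && !cache_refill(c))
                return false;
        *ret = c->cached++;
        return true;
}

int main(void)
{
        struct cache c = { .filled = 6 };
        struct entry *e;
        unsigned int n = 0;

        c.cached = c.sentinel = c.entries;
        for (unsigned int i = 0; i < c.filled; i++)
                c.entries[i].value = (int)i;
        while (get_entry(&c, &e))
                n++;
        printf("handed out %u entries\n", n);
        return 0;
}

In the kernel the cached range comes from the CQ ring itself and IORING_SETUP_CQE32 doubles the step, but the two-level split between a refill-only slow path and a pop-only fast path is the same.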
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/74dc8fdf2657e438b2e05e1d478a3596924604e9.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++--------- io_uring/io_uring.h | 23 ++++++++++++----------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0aeb33256a6d..de05831eeca7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -818,7 +818,7 @@ void io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -830,7 +830,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) - return NULL; + return false; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -838,7 +838,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (!len) - return NULL; + return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; @@ -847,12 +847,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return &rings->cqes[off]; + return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9b5dfb6ef484..9c80d20fe18f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -38,7 +38,7 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); @@ -112,19 +112,20 @@ static inline void io_req_task_work_add(struct io_kiocb *req) static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, bool overflow) { + struct io_uring_cqe *cqe; + io_lockdep_assert_cq_locked(ctx); - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return cqe; + if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + return NULL; } - - return __io_get_cqe(ctx, overflow); + cqe = ctx->cqe_cached; + ctx->cached_cq_tail++; + ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return cqe; } static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) From 59fbc409e71649f558fb4578cdbfac67acb824dc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:27 +0100 Subject: [PATCH 37/47] io_uring: optimise extra io_get_cqe null check If the cached cqe check passes in io_get_cqe*() it already means that the cqe we return is 
valid and non-zero, however the compiler is unable to optimise null checks like in io_fill_cqe_req(). Do a bit of trickery, return success/fail boolean from io_get_cqe*() and store cqe in the cqe parameter. That makes it do the right thing, erasing the check together with the introduced indirection. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/322ea4d3377d3d4efd8ae90ab8ed28a99f518210.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +++---- io_uring/io_uring.h | 20 +++++++++----------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index de05831eeca7..cfc2dc8c4b2f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -683,10 +683,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); + struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!cqe) + if (!io_get_cqe_overflow(ctx, &cqe, true)) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); @@ -862,8 +862,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * submission (by quite a lot). Increment the overflow count in * the ring. */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { + if (likely(io_get_cqe(ctx, &cqe))) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); WRITE_ONCE(cqe->user_data, user_data); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9c80d20fe18f..2960e35b32a5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -109,28 +109,27 @@ static inline void io_req_task_work_add(struct io_kiocb *req) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, - bool overflow) +static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, + struct io_uring_cqe **ret, + bool overflow) { - struct io_uring_cqe *cqe; - io_lockdep_assert_cq_locked(ctx); if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { if (unlikely(!io_cqe_cache_refill(ctx, overflow))) - return NULL; + return false; } - cqe = ctx->cqe_cached; + *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; if (ctx->flags & IORING_SETUP_CQE32) ctx->cqe_cached++; - return cqe; + return true; } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) { - return io_get_cqe_overflow(ctx, false); + return io_get_cqe_overflow(ctx, ret, false); } static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -142,8 +141,7 @@ static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req * submission (by quite a lot). Increment the overflow count in * the ring. */ - cqe = io_get_cqe(ctx); - if (unlikely(!cqe)) + if (unlikely(!io_get_cqe(ctx, &cqe))) return false; if (trace_io_uring_complete_enabled()) From 54927baf6c195fb512ac38b26a041ca44edb2e29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:28 +0100 Subject: [PATCH 38/47] io_uring: reorder cqring_flush and wakeups Unlike in the past, io_commit_cqring_flush() doesn't do anything that may need io_cqring_wake() to be issued after, all requests it completes will go via task_work. Do io_commit_cqring_flush() after io_cqring_wake() to clean up __io_cq_unlock_post(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ed32dcfeec47e6c97bd6b18c152ddce5b218403f.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++----------- io_uring/rw.c | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cfc2dc8c4b2f..7c1ef5b6628d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -629,19 +629,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - - if (ctx->task_complete) { - /* - * ->task_complete implies that only current might be waiting - * for CQEs, and obviously, we currently don't. No one is - * waiting, wakeups are futile, skip them. - */ - io_commit_cqring_flush(ctx); - } else { + if (!ctx->task_complete) { spin_unlock(&ctx->completion_lock); - io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } + io_commit_cqring_flush(ctx); } static void io_cq_unlock_post(struct io_ring_ctx *ctx) @@ -649,8 +641,8 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_commit_cqring_flush(ctx); io_cqring_wake(ctx); + io_commit_cqring_flush(ctx); } /* Returns true if there are no backlogged entries after the flush */ diff --git a/io_uring/rw.c b/io_uring/rw.c index 9b51afdae505..20140d3505f1 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -985,9 +985,9 @@ copy_iov: static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - io_commit_cqring_flush(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_wake(ctx); + io_commit_cqring_flush(ctx); } void io_rw_fail(struct io_kiocb *req) From ec26c225f06f5993f8891fa6c79fab3c92981181 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:29 +0100 Subject: [PATCH 39/47] io_uring: merge iopoll and normal completion paths io_do_iopoll() and io_submit_flush_completions() are pretty similar, both filling CQEs and then free a list of requests. Don't duplicate it and make iopoll use __io_submit_flush_completions(), which also helps with inlining and other optimisations. For that, we need to first find all completed iopoll requests and splice them from the iopoll list and then pass it down. This adds one extra list traversal, which should be fine as requests will stay hot in cache. CQ locking is already conditional, introduce ->lockless_cq and skip locking for IOPOLL as it's protected by ->uring_lock. We also add a wakeup optimisation for IOPOLL to __io_cq_unlock_post(), so it works just like io_cqring_ev_posted_iopoll(). 
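A rough sketch of the splice-then-flush flow described above, using made-up types rather than the kernel's io_wq_work_list API: walk the poll list for the completed prefix, cut that prefix off, and hand it to the one batch-completion routine both paths now share. The extra traversal is the cost mentioned in the message; the batch stays cache-hot.

#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *next;
	bool completed;
};

struct slist {
	struct node *first;
};

/* stand-in for the shared batch-completion path */
static void flush_batch(struct node *head)
{
	while (head) {
		struct node *next = head->next;

		/* post the CQE and free the request here */
		head = next;
	}
}

/* stand-in for the iopoll side: reap what finished, reuse the flush path */
static int reap_completed(struct slist *poll_list)
{
	struct node *prev = NULL, *cur = poll_list->first;
	struct node *batch;
	int nr = 0;

	/* pass 1: find the completed prefix of the poll list */
	while (cur && cur->completed) {
		prev = cur;
		cur = cur->next;
		nr++;
	}
	if (!nr)
		return 0;

	/* pass 2: cut that prefix off the list... */
	batch = poll_list->first;
	poll_list->first = cur;
	prev->next = NULL;

	/* ...and feed it to the common completion routine */
	flush_batch(batch);
	return nr;
}

int main(void)
{
	struct node n[3] = {
		{ .next = &n[1], .completed = true },
		{ .next = &n[2], .completed = true },
		{ .next = NULL,  .completed = false },
	};
	struct slist poll_list = { .first = &n[0] };

	return reap_completed(&poll_list) == 2 ? 0 : 1;	/* two reaped */
}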
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3840473f5e8a960de35b77292026691880f6bdbc.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 18 ++++++++++++------ io_uring/io_uring.h | 2 +- io_uring/rw.c | 24 +++++------------------- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9795eda529f7..c0c03d8059df 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -205,6 +205,7 @@ struct io_ring_ctx { unsigned int has_evfd: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int lockless_cq: 1; unsigned int syscall_iopoll: 1; unsigned int poll_activated: 1; unsigned int drain_disabled: 1; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7c1ef5b6628d..e8321903e3f3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,7 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct kmem_cache *req_cachep; @@ -616,7 +615,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->task_complete) + if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } @@ -630,8 +629,11 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); if (!ctx->task_complete) { - spin_unlock(&ctx->completion_lock); - io_cqring_wake(ctx); + if (!ctx->lockless_cq) + spin_unlock(&ctx->completion_lock); + /* IOPOLL rings only need to wake up if it's also SQPOLL */ + if (!ctx->syscall_iopoll) + io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); } @@ -1485,7 +1487,8 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { do { @@ -1522,7 +1525,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) } while (node); } -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) +void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; @@ -3836,6 +3839,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->lockless_cq = true; + /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2960e35b32a5..07fd185064d2 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -72,7 +72,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +void __io_submit_flush_completions(struct io_ring_ctx *ctx); int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work 
*work);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 20140d3505f1..0a1e515f0510 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -983,13 +983,6 @@ copy_iov:
 	return ret;
 }
 
-static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
-{
-	if (ctx->flags & IORING_SETUP_SQPOLL)
-		io_cqring_wake(ctx);
-	io_commit_cqring_flush(ctx);
-}
-
 void io_rw_fail(struct io_kiocb *req)
 {
 	int res;
@@ -1060,24 +1053,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (!smp_load_acquire(&req->iopoll_completed))
 			break;
 		nr_events++;
-		if (unlikely(req->flags & REQ_F_CQE_SKIP))
-			continue;
-		req->cqe.flags = io_put_kbuf(req, 0);
-		if (unlikely(!io_fill_cqe_req(ctx, req))) {
-			spin_lock(&ctx->completion_lock);
-			io_req_cqe_overflow(req);
-			spin_unlock(&ctx->completion_lock);
-		}
 	}
-
 	if (unlikely(!nr_events))
 		return 0;
-	io_commit_cqring(ctx);
-	io_cqring_ev_posted_iopoll(ctx);
 	pos = start ? start->next : ctx->iopoll_list.first;
 	wq_list_cut(&ctx->iopoll_list, prev, start);
-	io_free_batch_list(ctx, pos);
+
+	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
+		return 0;
+	ctx->submit_state.compl_reqs.first = pos;
+	__io_submit_flush_completions(ctx);
 	return nr_events;
 }

From 093a650b757210bc856ca7f5349fb5a4bb9d4bd6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 24 Aug 2023 23:53:30 +0100
Subject: [PATCH 40/47] io_uring: force inline io_fill_cqe_req

There are only 2 callers of io_fill_cqe_req left, and one of them is
extremely hot. Force inline the function.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/ffce4fc5e3521966def848a4d930586dfe33ae11.1692916914.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 07fd185064d2..547c30582fb8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -132,7 +132,8 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret
 	return io_get_cqe_overflow(ctx, ret, false);
 }
 
-static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
+					    struct io_kiocb *req)
 {
 	struct io_uring_cqe *cqe;

From e5598d6ae62626d261b046a2f19347c38681ff51 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 24 Aug 2023 23:53:31 +0100
Subject: [PATCH 41/47] io_uring: compact SQ/CQ heads/tails

Queue heads and tails are cache line aligned. That makes the sq and cq
fields take four cache lines, or five if we include the rest of struct
io_rings (e.g. sq_flags is frequently accessed). Since modern io_uring
is mostly single threaded, it doesn't make much sense to spread them out
like that; it wastes space and puts additional pressure on caches. Put
them all into a single line.
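To make the space cost concrete, a small userspace illustration (C11, assuming a 64-byte cache line; it uses _Alignas on an invented struct rather than the kernel's ____cacheline_aligned_in_smp macro): aligning each counter to its own line inflates the pair to two full lines, while packing them keeps both in one.

#include <stdio.h>
#include <stdint.h>

#define CACHELINE 64	/* assumed line size for the example */

struct ring_spread {
	_Alignas(CACHELINE) uint32_t head;
	_Alignas(CACHELINE) uint32_t tail;
};

struct ring_compact {
	uint32_t head;
	uint32_t tail;
};

int main(void)
{
	/* typically prints 128 vs 8 with a 64-byte line */
	printf("spread: %zu bytes, compact: %zu bytes\n",
	       sizeof(struct ring_spread), sizeof(struct ring_compact));
	return 0;
}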
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9c8deddf9a7ed32069235a530d1e117fb460bc4c.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c0c03d8059df..608a8e80e881 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -69,8 +69,8 @@ struct io_uring_task { }; struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; + u32 head; + u32 tail; }; /* From 2af89abda7d9c2aeb573677e2c498ddb09f8058a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:32 +0100 Subject: [PATCH 42/47] io_uring: add option to remove SQ indirection Not many aware, but io_uring submission queue has two levels. The first level usually appears as sq_array and stores indexes into the actual SQ. To my knowledge, no one has ever seriously used it, nor liburing exposes it to users. Add IORING_SETUP_NO_SQARRAY, when set we don't bother creating and using the sq_array and SQ heads/tails will be pointing directly into the SQ. Improves memory footprint, in term of both allocations as well as cache usage, and also should make io_get_sqe() less branchy in the end. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ffa3268a5ef61d326201ff43a233315c96312e0.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++++ io_uring/io_uring.c | 52 +++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9fc7195f25df..8e61f8b7c2ce 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -185,6 +185,11 @@ enum { */ #define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) +/* + * Removes indirection through the SQ index array. + */ +#define IORING_SETUP_NO_SQARRAY (1U << 16) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e8321903e3f3..a6eea3938802 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2339,8 +2339,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; + unsigned mask = ctx->sq_entries - 1; + unsigned head = ctx->cached_sq_head++ & mask; + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { + head = READ_ONCE(ctx->sq_array[head]); + if (unlikely(head >= ctx->sq_entries)) { + /* drop invalid entries */ + spin_lock(&ctx->completion_lock); + ctx->cq_extra--; + spin_unlock(&ctx->completion_lock); + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); + return false; + } + } /* * The cached sq head (or cq tail) serves two purposes: @@ -2350,22 +2363,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
 */
-	head = READ_ONCE(ctx->sq_array[sq_idx]);
-	if (likely(head < ctx->sq_entries)) {
-		/* double index for 128-byte SQEs, twice as long */
-		if (ctx->flags & IORING_SETUP_SQE128)
-			head <<= 1;
-		*sqe = &ctx->sq_sqes[head];
-		return true;
-	}
-	/* drop invalid entries */
-	spin_lock(&ctx->completion_lock);
-	ctx->cq_extra--;
-	spin_unlock(&ctx->completion_lock);
-	WRITE_ONCE(ctx->rings->sq_dropped,
-		   READ_ONCE(ctx->rings->sq_dropped) + 1);
-	return false;
+	/* double index for 128-byte SQEs, twice as long */
+	if (ctx->flags & IORING_SETUP_SQE128)
+		head <<= 1;
+	*sqe = &ctx->sq_sqes[head];
+	return true;
 }
 
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
@@ -2734,6 +2737,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
 		return SIZE_MAX;
 #endif
 
+	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+		if (sq_offset)
+			*sq_offset = SIZE_MAX;
+		return off;
+	}
+
 	if (sq_offset)
 		*sq_offset = off;
 
@@ -3710,7 +3719,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return PTR_ERR(rings);
 
 	ctx->rings = rings;
-	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
 	rings->sq_ring_mask = p->sq_entries - 1;
 	rings->cq_ring_mask = p->cq_entries - 1;
 	rings->sq_ring_entries = p->sq_entries;
@@ -3921,7 +3931,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
 	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
 	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
-	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
 	p->sq_off.resv1 = 0;
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
 		p->sq_off.user_addr = 0;
@@ -4010,7 +4021,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
-			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
+			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
+			IORING_SETUP_NO_SQARRAY))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);

From d7f06fea5d6be78403d42c9637f67bc883870094 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 24 Aug 2023 23:53:33 +0100
Subject: [PATCH 43/47] io_uring: move non aligned field to the end

Move the non-cache-aligned fields down in io_ring_ctx; this shouldn't
change anything, but it makes further refactoring easier.
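Returning to the IORING_SETUP_NO_SQARRAY patch above, a minimal userspace sketch of how the flag might be used (raw io_uring_setup() syscall, no liburing, error handling and ring mmap trimmed; the flag value is taken from the patch, and the kernel must be new enough to accept it, otherwise setup fails with EINVAL). With the flag set there is no index array to fill, so submission just writes SQEs and bumps the SQ tail.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

#ifndef IORING_SETUP_NO_SQARRAY
#define IORING_SETUP_NO_SQARRAY	(1U << 16)	/* value from the patch above */
#endif

int main(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_NO_SQARRAY;

	fd = (int)syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0) {
		perror("io_uring_setup");	/* likely an older kernel */
		return 1;
	}
	/* mmap of the SQ/CQ rings and SQEs omitted for brevity;
	 * sq_off.array is not filled in by the kernel in this mode */
	printf("ring fd %d, sq entries %u\n", fd, p.sq_entries);
	close(fd);
	return 0;
}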
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/518e95d7888e9d481b2c5968dcf3f23db9ea47a5.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 608a8e80e881..ad87d6074fb2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -270,24 +270,6 @@ struct io_ring_ctx { struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - unsigned int file_alloc_start; - unsigned int file_alloc_end; - - struct xarray personalities; - u32 pers_next; - struct { /* * We cache a range of free CQEs we can use, once exhausted it @@ -332,6 +314,24 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned long check_cq; + + unsigned int file_alloc_start; + unsigned int file_alloc_end; + + struct xarray personalities; + u32 pers_next; + /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; From 18df385f42f0b3310ed2e4a3e39264bf5e784692 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:34 +0100 Subject: [PATCH 44/47] io_uring: banish non-hot data to end of io_ring_ctx Let's move all slow path, setup/init and so on fields to the end of io_ring_ctx, that makes ctx reorganisation later easier. That includes, page arrays used only on tear down, CQ overflow list, old provided buffer caches and used by io-wq poll hashes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fc471b63925a0bf90a34943c4d36163c523cfb43.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ad87d6074fb2..72e609752323 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,20 +211,11 @@ struct io_ring_ctx { unsigned int drain_disabled: 1; unsigned int compat: 1; + struct task_struct *submitter_task; + struct io_rings *rings; + struct percpu_ref refs; + enum task_work_notify_mode notify_method; - - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. 
- */ - unsigned short n_ring_pages; - unsigned short n_sqe_pages; - struct page **ring_pages; - struct page **sqe_pages; - - struct io_rings *rings; - struct task_struct *submitter_task; - struct percpu_ref refs; } ____cacheline_aligned_in_smp; /* submission data */ @@ -262,10 +253,8 @@ struct io_ring_ctx { struct io_buffer_list *io_bl; struct xarray io_bl_xa; - struct list_head io_buffers_cache; struct io_hash_table cancel_table_locked; - struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; @@ -298,11 +287,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct io_hash_table cancel_table; struct llist_head work_llist; - - struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -318,6 +304,10 @@ struct io_ring_ctx { struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + struct list_head io_buffers_comp; + struct list_head cq_overflow_list; + struct io_hash_table cancel_table; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ @@ -332,6 +322,8 @@ struct io_ring_ctx { struct xarray personalities; u32 pers_next; + struct list_head io_buffers_cache; + /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; @@ -375,6 +367,15 @@ struct io_ring_ctx { unsigned sq_thread_idle; /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; }; struct io_tw_state { From c9def23dde5238184777340ad811e4903f216a2d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:35 +0100 Subject: [PATCH 45/47] io_uring: separate task_work/waiting cache line task_work's are typically queued up from IRQ/softirq potentially by a random CPU like in case of networking. Batch ctx fields bouncing as this into a separate cache line. We also move ->cq_timeouts there because waiters have to read and check it. We can also conditionally hide ->cq_timeouts in the future from the CQ wait path as a not really useful rudiment. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7f3fcb5b6b9cca0238778262c1fdb7ada6286b7.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 72e609752323..5de5dffe29df 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -270,15 +270,25 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; unsigned cq_extra; } ____cacheline_aligned_in_smp; + /* + * task_work and async notification delivery cacheline. Expected to + * regularly bounce b/w CPUs. 
+ */ + struct { + struct llist_head work_llist; + unsigned long check_cq; + atomic_t cq_wait_nr; + atomic_t cq_timeouts; + struct wait_queue_head cq_wait; + } ____cacheline_aligned_in_smp; + struct { spinlock_t completion_lock; bool poll_multi_queue; - atomic_t cq_wait_nr; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -287,14 +297,11 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - - struct llist_head work_llist; } ____cacheline_aligned_in_smp; /* timeouts */ struct { spinlock_t timeout_lock; - atomic_t cq_timeouts; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; @@ -314,8 +321,6 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq; - unsigned int file_alloc_start; unsigned int file_alloc_end; From 0aa7aa5f766933d4f91b22d9658cd688e1f15dab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:36 +0100 Subject: [PATCH 46/47] io_uring: move multishot cqe cache in ctx We cache multishot CQEs before flushing them to the CQ in submit_state.cqe. It's a 16 entry cache totalling 256 bytes in the middle of the io_submit_state structure. Move it out of there, it should help with CPU caches for the submission state, and shouldn't affect cached CQEs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dbe1f39c043ee23da918836be44fcec252ce6711.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- io_uring/io_uring.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5de5dffe29df..01bdbc223edd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -176,7 +176,6 @@ struct io_submit_state { unsigned short submit_nr; unsigned int cqes_count; struct blk_plug plug; - struct io_uring_cqe cqes[16]; }; struct io_ev_fd { @@ -307,6 +306,8 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; + struct io_uring_cqe completion_cqes[16]; + /* IRQ completion list, under ->completion_lock */ struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a6eea3938802..88599852af82 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -880,7 +880,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &state->cqes[i]; + struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { if (ctx->task_complete) { @@ -931,7 +931,7 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ @@ -945,7 +945,7 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return false; - cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; + cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; cqe->user_data = user_data; cqe->res = res; cqe->flags = cflags; From 
644c4a7a721fb90356cdd42219c9928a3c386230 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:37 +0100 Subject: [PATCH 47/47] io_uring: move iopoll ctx fields around Move poll_multi_queue and iopoll_list to the submission cache line, it doesn't make much sense to keep them separately, and is better place for it in general. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5b03cf7e6652e350e6e70a917eec72ba9f33b97b.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 01bdbc223edd..13d19b9be9f4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -256,6 +256,15 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct io_wq_work_list iopoll_list; + bool poll_multi_queue; } ____cacheline_aligned_in_smp; struct { @@ -284,20 +293,6 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; } ____cacheline_aligned_in_smp; - struct { - spinlock_t completion_lock; - - bool poll_multi_queue; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - } ____cacheline_aligned_in_smp; - /* timeouts */ struct { spinlock_t timeout_lock; @@ -308,6 +303,8 @@ struct io_ring_ctx { struct io_uring_cqe completion_cqes[16]; + spinlock_t completion_lock; + /* IRQ completion list, under ->completion_lock */ struct io_wq_work_list locked_free_list; unsigned int locked_free_nr;
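Taken together, the last few patches sort io_ring_ctx fields by who touches them: submitter-private hot state packed tightly, remotely-written notification state isolated on its own line, and cold setup/teardown data pushed to the tail. A generic userspace illustration of that strategy (invented struct and field names, assumed 64-byte lines, GCC-style aligned attribute rather than the kernel macros):

#include <stdio.h>
#include <stddef.h>
#include <stdatomic.h>

#define CACHELINE 64	/* assumed line size */

struct ctx {
	/* submitter-local hot state: packed together on one line */
	struct {
		unsigned cached_head;
		unsigned cached_tail;
		unsigned entries;
	} submit __attribute__((aligned(CACHELINE)));

	/* remotely-poked state: isolated so it bounces on its own line */
	struct {
		atomic_uint wait_nr;
		atomic_ulong check_flags;
	} notify __attribute__((aligned(CACHELINE)));

	/* cold setup/teardown data: fine at the end, placement irrelevant */
	void *ring_pages;
	unsigned short n_ring_pages;
};

int main(void)
{
	printf("submit @%zu, notify @%zu, cold @%zu, total %zu\n",
	       offsetof(struct ctx, submit), offsetof(struct ctx, notify),
	       offsetof(struct ctx, ring_pages), sizeof(struct ctx));
	return 0;
}

On a real kernel build, a tool like pahole can confirm the resulting offsets and hole layout.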