io_uring: add a sysctl to disable io_uring system-wide

Introduce a new sysctl (io_uring_disabled) which can be either 0, 1,
or 2. When 0 (the default), all processes are allowed to create io_uring
instances, which is the current behavior. When 1, all calls to
io_uring_setup fail with -EPERM unless the calling process has
CAP_SYS_ADMIN. When 2, calls to io_uring_setup fail with -EPERM
regardless of privilege.

BUG=b/293017665
TEST=presubmit
RELEASE_NOTE=None

cos-patch: bug
Signed-off-by: Matteo Rizzo <matteorizzo@google.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/r/20230630151003.3622786-2-matteorizzo@google.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Change-Id: I0536bc6e6443ddacadf463fcb78e708a34d88fe5
Reviewed-on: https://cos-review.googlesource.com/c/third_party/kernel/+/55596
Tested-by: Cusky Presubmit Bot <presubmit@cos-infra-prod.iam.gserviceaccount.com>
Reviewed-by: Arnav Kansal <rnv@google.com>
Main-Branch-Verified: Cusky Presubmit Bot <presubmit@cos-infra-prod.iam.gserviceaccount.com>
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 48b91c4..2b1fdf4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -442,6 +442,25 @@
 ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
 
 
+io_uring_disabled
+=================
+
+Prevents all processes from creating new io_uring instances. Enabling this
+shrinks the kernel's attack surface.
+
+= ==================================================================
+0 All processes can create io_uring instances as normal. This is the
+  default setting.
+1 io_uring creation is disabled for unprivileged processes.
+  io_uring_setup fails with -EPERM unless the calling process is
+  privileged (CAP_SYS_ADMIN). Existing io_uring instances can
+  still be used.
+2 io_uring creation is disabled for all processes. io_uring_setup
+  always fails with -EPERM. Existing io_uring instances can still be
+  used.
+= ==================================================================
+
+
 kexec_load_disabled
 ===================
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cbfc9bb..25cab91 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1115,6 +1115,21 @@
 static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
+static int __read_mostly sysctl_io_uring_disabled;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kernel_io_uring_disabled_table[] = {
+	{
+		.procname	= "io_uring_disabled",
+		.data		= &sysctl_io_uring_disabled,
+		.maxlen		= sizeof(sysctl_io_uring_disabled),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{},
+};
+#endif
 
 struct sock *io_uring_get_socket(struct file *file)
 {
@@ -10663,9 +10678,19 @@
 	return  io_uring_create(entries, &p, params);
 }
 
+static inline bool io_uring_allowed(void)
+{
+	int disabled = READ_ONCE(sysctl_io_uring_disabled);
+
+	return disabled == 0 || (disabled == 1 && capable(CAP_SYS_ADMIN));
+}
+
 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 		struct io_uring_params __user *, params)
 {
+	if (!io_uring_allowed())
+		return -EPERM;
+
 	return io_uring_setup(entries, params);
 }
 
@@ -11295,6 +11320,11 @@
 
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
 				SLAB_ACCOUNT);
+
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+#endif
+
 	return 0;
 };
 __initcall(io_uring_init);