From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 7 Jan 2019 10:46:33 -0700
Subject: [PATCH] BACKPORT: Add io_uring IO interface

The submission queue (SQ) and completion queue (CQ) rings are shared
between the application and the kernel. This eliminates the need to
copy data back and forth to submit and complete IO.

IO submissions use the io_uring_sqe data structure, and completions
are generated in the form of io_uring_cqe data structures. The SQ
ring is an index into the io_uring_sqe array, which makes it possible
to submit a batch of IOs without them being contiguous in the ring.
The CQ ring is always contiguous, as completion events are inherently
unordered, and hence any io_uring_cqe entry can point back to an
arbitrary submission.
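
For reference, the two structures look roughly like the sketch below.
This is taken from the upstream commit's include/uapi/linux/io_uring.h,
which is not part of this excerpt, so treat the exact layout as
illustrative rather than authoritative for the backport:

struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* as of now unused */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	__u64	off;		/* offset into file */
	__u64	addr;		/* pointer to buffer or iovecs */
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		__resv;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	__u64	__pad2[3];
};

struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data, passed back verbatim */
	__s32	res;		/* result code for this event */
	__u32	flags;
};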
Two new system calls are added for this:

io_uring_setup(entries, params)
	Sets up an io_uring instance for doing async IO. On success,
	returns a file descriptor that the application can mmap to
	gain access to the SQ ring, CQ ring, and io_uring_sqes.

io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
	Initiates IO against the rings mapped to this fd, or waits for
	them to complete, or both. The behavior is controlled by the
	parameters passed in. If 'to_submit' is non-zero, then we'll
	try to submit new IO. If IORING_ENTER_GETEVENTS is set, the
	kernel will wait for 'min_complete' events, if they aren't
	already available. It's valid to set IORING_ENTER_GETEVENTS
	and 'min_complete' == 0 at the same time; this allows the
	kernel to return already completed events without waiting
	for them. This is useful only for polling, as for IRQ
	driven IO, the application can just check the CQ ring
	without entering the kernel.
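
As an illustration only (not part of this patch), a minimal userspace
sequence might look like the sketch below. It assumes the uapi header
added elsewhere in this series (struct io_uring_params, struct
io_uring_sqe, the IORING_OFF_* mmap offsets, IORING_ENTER_GETEVENTS),
and invokes the syscalls by the numbers this patch wires up (425/426),
since libc has no wrappers yet:

#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>	/* added by this series, not in this excerpt */

#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup	425	/* numbers from the tables below */
#endif
#ifndef __NR_io_uring_enter
#define __NR_io_uring_enter	426
#endif

int main(void)
{
	struct io_uring_params p;
	void *sq_ring, *sqes;
	int fd;

	memset(&p, 0, sizeof(p));

	/* Ask for a ring with 8 SQ entries; the kernel fills in p. */
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0)
		return 1;

	/* Map the SQ ring and the io_uring_sqe array; the CQ ring is
	 * mapped the same way with IORING_OFF_CQ_RING. */
	sq_ring = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
		       PROT_READ | PROT_WRITE, MAP_SHARED, fd,
		       IORING_OFF_SQ_RING);
	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd,
		    IORING_OFF_SQES);
	if (sq_ring == MAP_FAILED || sqes == MAP_FAILED)
		return 1;

	/* ... fill an sqe via the mappings and advance the SQ tail ... */

	/* Submit one IO and wait for one completion in a single call. */
	syscall(__NR_io_uring_enter, fd, 1, 1, IORING_ENTER_GETEVENTS,
		NULL, 0);
	return 0;
}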
With this setup, it's possible to do async IO with a single system
call. Future developments will enable polled IO with this interface,
and polled submission as well. The latter will enable an application
to do IO without doing ANY system calls at all.

For IRQ driven IO, an application only needs to enter the kernel for
completions if it wants to wait for them to occur.

Each io_uring is backed by a workqueue, to support buffered async IO
as well. We will only punt to an async context if the command would
need to wait for IO on the device side. Any data that can be accessed
directly in the page cache is done inline. This avoids the slowness
issue of usual threadpools, since cached data is accessed as quickly
as a sync interface.

Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c

dgreid - Also pull in the ARM and ARM64 changes, which were added in
a separate and only partially related commit.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
--- a/arch/arm/tools/syscall.tbl 2017-12-28 15:53:02.000000000 +0000
+++ b/arch/arm/tools/syscall.tbl 2020-10-30 03:59:25.357442520 +0000
@@ -412,3 +412,5 @@
395 common pkey_alloc sys_pkey_alloc
396 common pkey_free sys_pkey_free
397 common statx sys_statx
+425 common io_uring_setup sys_io_uring_setup
+426 common io_uring_enter sys_io_uring_enter
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
--- a/arch/arm64/include/asm/unistd32.h 2017-12-28 15:53:02.000000000 +0000
+++ b/arch/arm64/include/asm/unistd32.h 2020-10-30 04:02:43.711946882 +0000
@@ -817,6 +817,10 @@
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 397
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_io_uring_setup 425
+__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
+#define __NR_io_uring_enter 426
+__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
--- a/arch/arm64/include/asm/unistd.h 2017-12-28 15:53:02.000000000 +0000
+++ b/arch/arm64/include/asm/unistd.h 2020-10-30 04:02:56.152354901 +0000
@@ -43,7 +43,7 @@
#define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2)
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5)
-#define __NR_compat_syscalls 398
+#define __NR_compat_syscalls 427
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl 2017-12-28 15:53:02.000000000 +0000
+++ b/arch/x86/entry/syscalls/syscall_32.tbl 2020-10-30 03:58:02.198716480 +0000
@@ -391,3 +391,5 @@
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl 2017-12-28 15:53:02.000000000 +0000
+++ b/arch/x86/entry/syscalls/syscall_64.tbl 2020-10-30 03:49:50.902629025 +0000
@@ -339,6 +339,8 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+425 common io_uring_setup __x64_sys_io_uring_setup
+426 common io_uring_enter __x64_sys_io_uring_enter
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..3072dbaa7869 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -67,6 +67,7 @@ struct file_handle;
struct file_handle;
struct sigaltstack;
union bpf_attr;
+struct io_uring_params;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -543,6 +544,11 @@ asmlinkage long sys_io_submit(aio_context_t, long,
struct iocb __user * __user *);
asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
struct io_event __user *result);
+asmlinkage long sys_io_uring_setup(u32 entries,
+ struct io_uring_params __user *p);
+asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
+ u32 min_complete, u32 flags,
+ const sigset_t __user *sig, size_t sigsz);
asmlinkage long sys_sendfile(int out_fd, int in_fd,
off_t __user *offset, size_t count);
asmlinkage long sys_sendfile64(int out_fd, int in_fd,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..87871e7b7ea7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,9 +732,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_io_uring_setup 425
+__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
+#define __NR_io_uring_enter 426
+__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 427
/*
* 32 bit systems traditionally used different
--
2.20.1