Diffstat (limited to 'src')
-rw-r--r--  src/Makefile                     |  41
-rw-r--r--  src/arch/aarch64/syscall.h       |  95
-rw-r--r--  src/arch/generic/lib.h           |  21
-rw-r--r--  src/arch/generic/syscall.h       |  87
-rw-r--r--  src/arch/syscall-defs.h          |  74
-rw-r--r--  src/arch/x86/lib.h               |  15
-rw-r--r--  src/arch/x86/syscall.h           | 300
-rw-r--r--  src/include/liburing.h           | 719
-rw-r--r--  src/include/liburing/barrier.h   |   8
-rw-r--r--  src/include/liburing/io_uring.h  | 192
-rw-r--r--  src/int_flags.h                  |   9
-rw-r--r--  src/lib.h                        |  57
-rw-r--r--  src/liburing.map                 |  18
-rw-r--r--  src/nolibc.c                     |  48
-rw-r--r--  src/queue.c                      | 225
-rw-r--r--  src/register.c                   | 328
-rw-r--r--  src/setup.c                      | 157
-rw-r--r--  src/syscall.c                    |  44
-rw-r--r--  src/syscall.h                    |  99
19 files changed, 2059 insertions(+), 478 deletions(-)
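Among other things, the diff below adds the ring-mapped provided-buffer API: io_uring_register_buf_ring() plus the io_uring_buf_ring_init/add/advance helpers and the io_uring_buf_reg argument struct. A minimal sketch of driving that API follows, using only names introduced in this diff; BUF_COUNT, BUF_SIZE and the caller-supplied 'backing' allocation are assumptions for illustration, and error handling is trimmed.

	#include <stdlib.h>
	#include <string.h>
	#include <liburing.h>

	#define BUF_COUNT 8	/* ring_entries must be a power of two */
	#define BUF_SIZE  4096

	static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring,
							char *backing, int bgid)
	{
		struct io_uring_buf_ring *br;
		struct io_uring_buf_reg reg;
		int i;

		/* Shared ring memory; the tail overlays bufs[0].resv, so the
		 * ring itself is just BUF_COUNT io_uring_buf entries. */
		if (posix_memalign((void **) &br, 4096,
				   BUF_COUNT * sizeof(struct io_uring_buf)))
			return NULL;

		memset(&reg, 0, sizeof(reg));
		reg.ring_addr = (unsigned long) br;
		reg.ring_entries = BUF_COUNT;
		reg.bgid = bgid;
		if (io_uring_register_buf_ring(ring, &reg, 0) < 0)
			return NULL;

		io_uring_buf_ring_init(br);
		for (i = 0; i < BUF_COUNT; i++)
			io_uring_buf_ring_add(br, backing + i * BUF_SIZE,
					      BUF_SIZE, i,
					      io_uring_buf_ring_mask(BUF_COUNT),
					      i);
		/* One store-release publishes all BUF_COUNT buffers. */
		io_uring_buf_ring_advance(br, BUF_COUNT);
		return br;
	}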
diff --git a/src/Makefile b/src/Makefile index dfca826..12cf49f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,22 +1,23 @@ +include ../Makefile.common + prefix ?= /usr includedir ?= $(prefix)/include libdir ?= $(prefix)/lib libdevdir ?= $(prefix)/lib CPPFLAGS ?= -override CPPFLAGS += -Iinclude/ -include ../config-host.h -CFLAGS ?= -g -fomit-frame-pointer -O2 -override CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare +override CPPFLAGS += -D_GNU_SOURCE \ + -Iinclude/ -include ../config-host.h +CFLAGS ?= -g -O2 -Wall -Wextra -fno-stack-protector +override CFLAGS += -Wno-unused-parameter -Wno-sign-compare -DLIBURING_INTERNAL SO_CFLAGS=-fPIC $(CFLAGS) L_CFLAGS=$(CFLAGS) LINK_FLAGS= LINK_FLAGS+=$(LDFLAGS) ENABLE_SHARED ?= 1 -soname=liburing.so.2 -minor=0 -micro=0 -libname=$(soname).$(minor).$(micro) +soname=liburing.so.$(VERSION_MAJOR) +libname=liburing.so.$(VERSION) all_targets += liburing.a ifeq ($(ENABLE_SHARED),1) @@ -31,19 +32,31 @@ endif all: $(all_targets) -liburing_srcs := setup.c queue.c syscall.c register.c +liburing_srcs := setup.c queue.c register.c + +ifeq ($(CONFIG_NOLIBC),y) + liburing_srcs += nolibc.c + override CFLAGS += -nostdlib -nodefaultlibs -ffreestanding + override CPPFLAGS += -nostdlib -nodefaultlibs -ffreestanding + override LINK_FLAGS += -nostdlib -nodefaultlibs +else + liburing_srcs += syscall.c +endif +override CPPFLAGS += -MT "$@" -MMD -MP -MF "$@.d" liburing_objs := $(patsubst %.c,%.ol,$(liburing_srcs)) liburing_sobjs := $(patsubst %.c,%.os,$(liburing_srcs)) -$(liburing_objs) $(liburing_sobjs): include/liburing/io_uring.h - %.os: %.c $(QUIET_CC)$(CC) $(CPPFLAGS) $(SO_CFLAGS) -c -o $@ $< %.ol: %.c $(QUIET_CC)$(CC) $(CPPFLAGS) $(L_CFLAGS) -c -o $@ $< +# Include compiler generated dependency files. +-include $(liburing_objs:%=%.d) +-include $(liburing_sobjs:%=%.d) + AR ?= ar RANLIB ?= ranlib liburing.a: $(liburing_objs) @@ -66,9 +79,11 @@ ifeq ($(ENABLE_SHARED),1) ln -sf $(relativelibdir)$(libname) $(libdevdir)/liburing.so endif -$(liburing_objs): include/liburing.h - clean: @rm -f $(all_targets) $(liburing_objs) $(liburing_sobjs) $(soname).new - @rm -f *.so* *.a *.o + @rm -f *.so* *.a *.o *.d @rm -f include/liburing/compat.h + + @# When cleaning, we don't include ../config-host.mak, + @# so the nolibc objects are always skipped, clean them up! + @rm -f nolibc.ol nolibc.os diff --git a/src/arch/aarch64/syscall.h b/src/arch/aarch64/syscall.h new file mode 100644 index 0000000..c0ab7e2 --- /dev/null +++ b/src/arch/aarch64/syscall.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef __INTERNAL__LIBURING_SYSCALL_H + #error "This file should be included from src/syscall.h (liburing)" +#endif + +#ifndef LIBURING_ARCH_AARCH64_SYSCALL_H +#define LIBURING_ARCH_AARCH64_SYSCALL_H + +#if defined(__aarch64__) + +#define __do_syscallN(...) 
({ \ + __asm__ volatile ( \ + "svc 0" \ + : "=r"(x0) \ + : __VA_ARGS__ \ + : "memory", "cc"); \ + (long) x0; \ +}) + +#define __do_syscall0(__n) ({ \ + register long x8 __asm__("x8") = __n; \ + register long x0 __asm__("x0"); \ + \ + __do_syscallN("r" (x8)); \ +}) + +#define __do_syscall1(__n, __a) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + \ + __do_syscallN("r" (x8), "0" (x0)); \ +}) + +#define __do_syscall2(__n, __a, __b) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1)); \ +}) + +#define __do_syscall3(__n, __a, __b, __c) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2)); \ +}) + +#define __do_syscall4(__n, __a, __b, __c, __d) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3));\ +}) + +#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + register __typeof__(__e) x4 __asm__("x4") = __e; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \ + "r"(x4)); \ +}) + +#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + register __typeof__(__e) x4 __asm__("x4") = __e; \ + register __typeof__(__f) x5 __asm__("x5") = __f; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \ + "r" (x4), "r"(x5)); \ +}) + +#include "../syscall-defs.h" + +#else /* #if defined(__aarch64__) */ + +#include "../generic/syscall.h" + +#endif /* #if defined(__aarch64__) */ + +#endif /* #ifndef LIBURING_ARCH_AARCH64_SYSCALL_H */ diff --git a/src/arch/generic/lib.h b/src/arch/generic/lib.h new file mode 100644 index 0000000..737e795 --- /dev/null +++ b/src/arch/generic/lib.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef __INTERNAL__LIBURING_LIB_H + #error "This file should be included from src/lib.h (liburing)" +#endif + +#ifndef LIBURING_ARCH_GENERIC_LIB_H +#define LIBURING_ARCH_GENERIC_LIB_H + +static inline long get_page_size(void) +{ + long page_size; + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + page_size = 4096; + + return page_size; +} + +#endif /* #ifndef LIBURING_ARCH_GENERIC_LIB_H */ diff --git a/src/arch/generic/syscall.h b/src/arch/generic/syscall.h new file mode 100644 index 0000000..fa93064 --- /dev/null +++ b/src/arch/generic/syscall.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef __INTERNAL__LIBURING_SYSCALL_H + #error "This file should be included from src/syscall.h (liburing)" +#endif + +#ifndef 
LIBURING_ARCH_GENERIC_SYSCALL_H +#define LIBURING_ARCH_GENERIC_SYSCALL_H + +static inline int ____sys_io_uring_register(int fd, unsigned opcode, + const void *arg, unsigned nr_args) +{ + int ret; + ret = syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); + return (ret < 0) ? -errno : ret; +} + +static inline int ____sys_io_uring_setup(unsigned entries, + struct io_uring_params *p) +{ + int ret; + ret = syscall(__NR_io_uring_setup, entries, p); + return (ret < 0) ? -errno : ret; +} + +static inline int ____sys_io_uring_enter2(int fd, unsigned to_submit, + unsigned min_complete, unsigned flags, + sigset_t *sig, int sz) +{ + int ret; + ret = syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, + sig, sz); + return (ret < 0) ? -errno : ret; +} + +static inline int ____sys_io_uring_enter(int fd, unsigned to_submit, + unsigned min_complete, unsigned flags, + sigset_t *sig) +{ + return ____sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig, + _NSIG / 8); +} + +static inline void *__sys_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + void *ret; + ret = mmap(addr, length, prot, flags, fd, offset); + return (ret == MAP_FAILED) ? ERR_PTR(-errno) : ret; +} + +static inline int __sys_munmap(void *addr, size_t length) +{ + int ret; + ret = munmap(addr, length); + return (ret < 0) ? -errno : ret; +} + +static inline int __sys_madvise(void *addr, size_t length, int advice) +{ + int ret; + ret = madvise(addr, length, advice); + return (ret < 0) ? -errno : ret; +} + +static inline int __sys_getrlimit(int resource, struct rlimit *rlim) +{ + int ret; + ret = getrlimit(resource, rlim); + return (ret < 0) ? -errno : ret; +} + +static inline int __sys_setrlimit(int resource, const struct rlimit *rlim) +{ + int ret; + ret = setrlimit(resource, rlim); + return (ret < 0) ? -errno : ret; +} + +static inline int __sys_close(int fd) +{ + int ret; + ret = close(fd); + return (ret < 0) ? 
-errno : ret; +} + +#endif /* #ifndef LIBURING_ARCH_GENERIC_SYSCALL_H */ diff --git a/src/arch/syscall-defs.h b/src/arch/syscall-defs.h new file mode 100644 index 0000000..1e8ae1b --- /dev/null +++ b/src/arch/syscall-defs.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef LIBURING_ARCH_SYSCALL_DEFS_H +#define LIBURING_ARCH_SYSCALL_DEFS_H + +static inline void *__sys_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + int nr; + +#if defined(__i386__) + nr = __NR_mmap2; + offset >>= 12; +#else + nr = __NR_mmap; +#endif + return (void *) __do_syscall6(nr, addr, length, prot, flags, fd, offset); +} + +static inline int __sys_munmap(void *addr, size_t length) +{ + return (int) __do_syscall2(__NR_munmap, addr, length); +} + +static inline int __sys_madvise(void *addr, size_t length, int advice) +{ + return (int) __do_syscall3(__NR_madvise, addr, length, advice); +} + +static inline int __sys_getrlimit(int resource, struct rlimit *rlim) +{ + return (int) __do_syscall2(__NR_getrlimit, resource, rlim); +} + +static inline int __sys_setrlimit(int resource, const struct rlimit *rlim) +{ + return (int) __do_syscall2(__NR_setrlimit, resource, rlim); +} + +static inline int __sys_close(int fd) +{ + return (int) __do_syscall1(__NR_close, fd); +} + +static inline int ____sys_io_uring_register(int fd, unsigned opcode, + const void *arg, unsigned nr_args) +{ + return (int) __do_syscall4(__NR_io_uring_register, fd, opcode, arg, + nr_args); +} + +static inline int ____sys_io_uring_setup(unsigned entries, + struct io_uring_params *p) +{ + return (int) __do_syscall2(__NR_io_uring_setup, entries, p); +} + +static inline int ____sys_io_uring_enter2(int fd, unsigned to_submit, + unsigned min_complete, unsigned flags, + sigset_t *sig, int sz) +{ + return (int) __do_syscall6(__NR_io_uring_enter, fd, to_submit, + min_complete, flags, sig, sz); +} + +static inline int ____sys_io_uring_enter(int fd, unsigned to_submit, + unsigned min_complete, unsigned flags, + sigset_t *sig) +{ + return ____sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig, + _NSIG / 8); +} + +#endif diff --git a/src/arch/x86/lib.h b/src/arch/x86/lib.h new file mode 100644 index 0000000..e6a74f3 --- /dev/null +++ b/src/arch/x86/lib.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef __INTERNAL__LIBURING_LIB_H + #error "This file should be included from src/lib.h (liburing)" +#endif + +#ifndef LIBURING_ARCH_X86_LIB_H +#define LIBURING_ARCH_X86_LIB_H + +static inline long get_page_size(void) +{ + return 4096; +} + +#endif /* #ifndef LIBURING_ARCH_X86_LIB_H */ diff --git a/src/arch/x86/syscall.h b/src/arch/x86/syscall.h new file mode 100644 index 0000000..43c576b --- /dev/null +++ b/src/arch/x86/syscall.h @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef __INTERNAL__LIBURING_SYSCALL_H + #error "This file should be included from src/syscall.h (liburing)" +#endif + +#ifndef LIBURING_ARCH_X86_SYSCALL_H +#define LIBURING_ARCH_X86_SYSCALL_H + +#if defined(__x86_64__) +/** + * Note for syscall registers usage (x86-64): + * - %rax is the syscall number. + * - %rax is also the return value. + * - %rdi is the 1st argument. + * - %rsi is the 2nd argument. + * - %rdx is the 3rd argument. + * - %r10 is the 4th argument (**yes it's %r10, not %rcx!**). + * - %r8 is the 5th argument. + * - %r9 is the 6th argument. + * + * `syscall` instruction will clobber %r11 and %rcx. + * + * After the syscall returns to userspace: + * - %r11 will contain %rflags. 
+ * - %rcx will contain the return address. + * + * IOW, after the syscall returns to userspace: + * %r11 == %rflags and %rcx == %rip. + */ + +#define __do_syscall0(NUM) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"(NUM) /* %rax */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall1(NUM, ARG1) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)) /* %rdi */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall2(NUM, ARG1, ARG2) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)) /* %rsi */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)) /* %rdx */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10) /* %r10 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \ + register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10), /* %r10 */ \ + "r"(__r8) /* %r8 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \ + register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \ + register __typeof__(ARG6) __r9 __asm__("r9") = (ARG6); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10), /* %r10 */ \ + "r"(__r8), /* %r8 */ \ + "r"(__r9) /* %r9 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#include "../syscall-defs.h" + +#else /* #if defined(__x86_64__) */ + +#ifdef CONFIG_NOLIBC +/** + * Note for syscall registers usage (x86, 32-bit): + * - %eax is the syscall number. + * - %eax is also the return value. + * - %ebx is the 1st argument. + * - %ecx is the 2nd argument. + * - %edx is the 3rd argument. + * - %esi is the 4th argument. + * - %edi is the 5th argument. + * - %ebp is the 6th argument. 
+ */ + +#define __do_syscall0(NUM) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a"(eax) /* %eax */ \ + : "a"(NUM) /* %eax */ \ + : "memory" \ + ); \ + eax; \ +}) + +#define __do_syscall1(NUM, ARG1) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a"(eax) /* %eax */ \ + : "a"(NUM), /* %eax */ \ + "b"((ARG1)) /* %ebx */ \ + : "memory" \ + ); \ + eax; \ +}) + +#define __do_syscall2(NUM, ARG1, ARG2) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a" (eax) /* %eax */ \ + : "a"(NUM), /* %eax */ \ + "b"((ARG1)), /* %ebx */ \ + "c"((ARG2)) /* %ecx */ \ + : "memory" \ + ); \ + eax; \ +}) + +#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a" (eax) /* %eax */ \ + : "a"(NUM), /* %eax */ \ + "b"((ARG1)), /* %ebx */ \ + "c"((ARG2)), /* %ecx */ \ + "d"((ARG3)) /* %edx */ \ + : "memory" \ + ); \ + eax; \ +}) + +#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a" (eax) /* %eax */ \ + : "a"(NUM), /* %eax */ \ + "b"((ARG1)), /* %ebx */ \ + "c"((ARG2)), /* %ecx */ \ + "d"((ARG3)), /* %edx */ \ + "S"((ARG4)) /* %esi */ \ + : "memory" \ + ); \ + eax; \ +}) + +#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \ + intptr_t eax; \ + \ + __asm__ volatile( \ + "int $0x80" \ + : "=a" (eax) /* %eax */ \ + : "a"(NUM), /* %eax */ \ + "b"((ARG1)), /* %ebx */ \ + "c"((ARG2)), /* %ecx */ \ + "d"((ARG3)), /* %edx */ \ + "S"((ARG4)), /* %esi */ \ + "D"((ARG5)) /* %edi */ \ + : "memory" \ + ); \ + eax; \ +}) + + +/* + * On i386, the 6th argument of syscall goes in %ebp. However, both Clang + * and GCC cannot use %ebp in the clobber list and in the "r" constraint + * without using -fomit-frame-pointer. To make it always available for + * any kind of compilation, the below workaround is implemented: + * + * 1) Push the 6-th argument. + * 2) Push %ebp. + * 3) Load the 6-th argument from 4(%esp) to %ebp. + * 4) Do the syscall (int $0x80). + * 5) Pop %ebp (restore the old value of %ebp). + * 6) Add %esp by 4 (undo the stack pointer). + * + * WARNING: + * Don't use register variables for __do_syscall6(), there is a known + * GCC bug that results in an endless loop. 
+ * + * BugLink: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105032 + * + */ +#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \ + intptr_t eax = (intptr_t)(NUM); \ + intptr_t arg6 = (intptr_t)(ARG6); /* Always in memory */ \ + __asm__ volatile ( \ + "pushl %[_arg6]\n\t" \ + "pushl %%ebp\n\t" \ + "movl 4(%%esp),%%ebp\n\t" \ + "int $0x80\n\t" \ + "popl %%ebp\n\t" \ + "addl $4,%%esp" \ + : "+a"(eax) /* %eax */ \ + : "b"(ARG1), /* %ebx */ \ + "c"(ARG2), /* %ecx */ \ + "d"(ARG3), /* %edx */ \ + "S"(ARG4), /* %esi */ \ + "D"(ARG5), /* %edi */ \ + [_arg6]"m"(arg6) /* memory */ \ + : "memory", "cc" \ + ); \ + eax; \ +}) + +#include "../syscall-defs.h" + +#else /* #ifdef CONFIG_NOLIBC */ + +#include "../generic/syscall.h" + +#endif /* #ifdef CONFIG_NOLIBC */ + +#endif /* #if defined(__x86_64__) */ + +#endif /* #ifndef LIBURING_ARCH_X86_SYSCALL_H */ diff --git a/src/include/liburing.h b/src/include/liburing.h index 51dc602..1c1b03e 100644 --- a/src/include/liburing.h +++ b/src/include/liburing.h @@ -6,25 +6,31 @@ #define _XOPEN_SOURCE 500 /* Required for glibc to expose sigset_t */ #endif +#ifndef _GNU_SOURCE +#define _GNU_SOURCE /* Required for musl to expose cpu_set_t */ +#endif + #include <sys/socket.h> -#include <sys/uio.h> #include <sys/stat.h> +#include <sys/uio.h> #include <errno.h> #include <signal.h> #include <stdbool.h> #include <inttypes.h> #include <time.h> +#include <fcntl.h> +#include <sched.h> #include <linux/swab.h> #include "liburing/compat.h" #include "liburing/io_uring.h" #include "liburing/barrier.h" #ifndef uring_unlikely -# define uring_unlikely(cond) __builtin_expect(!!(cond), 0) +#define uring_unlikely(cond) __builtin_expect(!!(cond), 0) #endif #ifndef uring_likely -# define uring_likely(cond) __builtin_expect(!!(cond), 1) +#define uring_likely(cond) __builtin_expect(!!(cond), 1) #endif #ifdef __cplusplus @@ -75,7 +81,10 @@ struct io_uring { int ring_fd; unsigned features; - unsigned pad[3]; + int enter_ring_fd; + __u8 int_flags; + __u8 pad[3]; + unsigned pad2; }; /* @@ -86,74 +95,115 @@ struct io_uring { * return an allocated io_uring_probe structure, or NULL if probe fails (for * example, if it is not available). 
The caller is responsible for freeing it */ -extern struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring); +struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring); /* same as io_uring_get_probe_ring, but takes care of ring init and teardown */ -extern struct io_uring_probe *io_uring_get_probe(void); +struct io_uring_probe *io_uring_get_probe(void); /* * frees a probe allocated through io_uring_get_probe() or * io_uring_get_probe_ring() */ -extern void io_uring_free_probe(struct io_uring_probe *probe); +void io_uring_free_probe(struct io_uring_probe *probe); -static inline int io_uring_opcode_supported(const struct io_uring_probe *p, int op) +static inline int io_uring_opcode_supported(const struct io_uring_probe *p, + int op) { if (op > p->last_op) return 0; return (p->ops[op].flags & IO_URING_OP_SUPPORTED) != 0; } -extern int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, - struct io_uring_params *p); -extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, - unsigned flags); -extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, - struct io_uring *ring); -extern int io_uring_ring_dontfork(struct io_uring *ring); -extern void io_uring_queue_exit(struct io_uring *ring); +int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p); +int io_uring_queue_init(unsigned entries, struct io_uring *ring, + unsigned flags); +int io_uring_queue_mmap(int fd, struct io_uring_params *p, + struct io_uring *ring); +int io_uring_ring_dontfork(struct io_uring *ring); +void io_uring_queue_exit(struct io_uring *ring); unsigned io_uring_peek_batch_cqe(struct io_uring *ring, struct io_uring_cqe **cqes, unsigned count); -extern int io_uring_wait_cqes(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, unsigned wait_nr, - struct __kernel_timespec *ts, sigset_t *sigmask); -extern int io_uring_wait_cqe_timeout(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, struct __kernel_timespec *ts); -extern int io_uring_submit(struct io_uring *ring); -extern int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr); -extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); - -extern int io_uring_register_buffers(struct io_uring *ring, - const struct iovec *iovecs, - unsigned nr_iovecs); -extern int io_uring_unregister_buffers(struct io_uring *ring); -extern int io_uring_register_files(struct io_uring *ring, const int *files, - unsigned nr_files); -extern int io_uring_unregister_files(struct io_uring *ring); -extern int io_uring_register_files_update(struct io_uring *ring, unsigned off, - int *files, unsigned nr_files); -extern int io_uring_register_eventfd(struct io_uring *ring, int fd); -extern int io_uring_register_eventfd_async(struct io_uring *ring, int fd); -extern int io_uring_unregister_eventfd(struct io_uring *ring); -extern int io_uring_register_probe(struct io_uring *ring, - struct io_uring_probe *p, unsigned nr); -extern int io_uring_register_personality(struct io_uring *ring); -extern int io_uring_unregister_personality(struct io_uring *ring, int id); -extern int io_uring_register_restrictions(struct io_uring *ring, - struct io_uring_restriction *res, - unsigned int nr_res); -extern int io_uring_enable_rings(struct io_uring *ring); -extern int __io_uring_sqring_wait(struct io_uring *ring); +int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr, + unsigned wait_nr, struct __kernel_timespec *ts, + sigset_t *sigmask); +int 
io_uring_wait_cqe_timeout(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + struct __kernel_timespec *ts); +int io_uring_submit(struct io_uring *ring); +int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr); +int io_uring_submit_and_wait_timeout(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + unsigned wait_nr, + struct __kernel_timespec *ts, + sigset_t *sigmask); + +int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs, + unsigned nr_iovecs); +int io_uring_register_buffers_tags(struct io_uring *ring, + const struct iovec *iovecs, + const __u64 *tags, unsigned nr); +int io_uring_register_buffers_sparse(struct io_uring *ring, unsigned nr); +int io_uring_register_buffers_update_tag(struct io_uring *ring, + unsigned off, + const struct iovec *iovecs, + const __u64 *tags, unsigned nr); +int io_uring_unregister_buffers(struct io_uring *ring); + +int io_uring_register_files(struct io_uring *ring, const int *files, + unsigned nr_files); +int io_uring_register_files_tags(struct io_uring *ring, const int *files, + const __u64 *tags, unsigned nr); +int io_uring_register_files_sparse(struct io_uring *ring, unsigned nr); +int io_uring_register_files_update_tag(struct io_uring *ring, unsigned off, + const int *files, const __u64 *tags, + unsigned nr_files); + +int io_uring_unregister_files(struct io_uring *ring); +int io_uring_register_files_update(struct io_uring *ring, unsigned off, + int *files, unsigned nr_files); +int io_uring_register_eventfd(struct io_uring *ring, int fd); +int io_uring_register_eventfd_async(struct io_uring *ring, int fd); +int io_uring_unregister_eventfd(struct io_uring *ring); +int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p, + unsigned nr); +int io_uring_register_personality(struct io_uring *ring); +int io_uring_unregister_personality(struct io_uring *ring, int id); +int io_uring_register_restrictions(struct io_uring *ring, + struct io_uring_restriction *res, + unsigned int nr_res); +int io_uring_enable_rings(struct io_uring *ring); +int __io_uring_sqring_wait(struct io_uring *ring); +int io_uring_register_iowq_aff(struct io_uring *ring, size_t cpusz, + const cpu_set_t *mask); +int io_uring_unregister_iowq_aff(struct io_uring *ring); +int io_uring_register_iowq_max_workers(struct io_uring *ring, + unsigned int *values); +int io_uring_register_ring_fd(struct io_uring *ring); +int io_uring_unregister_ring_fd(struct io_uring *ring); +int io_uring_register_buf_ring(struct io_uring *ring, + struct io_uring_buf_reg *reg, unsigned int flags); +int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid); /* * Helper for the peek/wait single cqe functions. Exported because of that, * but probably shouldn't be used directly in an application. */ -extern int __io_uring_get_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, unsigned submit, - unsigned wait_nr, sigset_t *sigmask); +int __io_uring_get_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, unsigned submit, + unsigned wait_nr, sigset_t *sigmask); #define LIBURING_UDATA_TIMEOUT ((__u64) -1) +/* + * Calculates the step size for CQE iteration. + * For standard CQE's its 1, for big CQE's its two. 
+ */ +#define io_uring_cqe_shift(ring) \ + (!!((ring)->flags & IORING_SETUP_CQE32)) + +#define io_uring_cqe_index(ring,ptr,mask) \ + (((ptr) & (mask)) << io_uring_cqe_shift(ring)) + #define io_uring_for_each_cqe(ring, head, cqe) \ /* \ * io_uring_smp_load_acquire() enforces the order of tail \ @@ -161,7 +211,7 @@ extern int __io_uring_get_cqe(struct io_uring *ring, */ \ for (head = *(ring)->cq.khead; \ (cqe = (head != io_uring_smp_load_acquire((ring)->cq.ktail) ? \ - &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL)); \ + &(ring)->cq.cqes[io_uring_cqe_index(ring, head, *(ring)->cq.kring_mask)] : NULL)); \ head++) \ /* @@ -195,6 +245,11 @@ static inline void io_uring_cqe_seen(struct io_uring *ring, /* * Command prep helpers */ + +/* + * Associate pointer @data with the sqe, for later retrieval from the cqe + * at command completion time with io_uring_cqe_get_data(). + */ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) { sqe->user_data = (unsigned long) data; @@ -205,17 +260,45 @@ static inline void *io_uring_cqe_get_data(const struct io_uring_cqe *cqe) return (void *) (uintptr_t) cqe->user_data; } +/* + * Assign a 64-bit value to this sqe, which can get retrieved at completion + * time with io_uring_cqe_get_data64. Just like the non-64 variants, except + * these store a 64-bit type rather than a data pointer. + */ +static inline void io_uring_sqe_set_data64(struct io_uring_sqe *sqe, + __u64 data) +{ + sqe->user_data = data; +} + +static inline __u64 io_uring_cqe_get_data64(const struct io_uring_cqe *cqe) +{ + return cqe->user_data; +} + +/* + * Tell the app the have the 64-bit variants of the get/set userdata + */ +#define LIBURING_HAVE_DATA64 + static inline void io_uring_sqe_set_flags(struct io_uring_sqe *sqe, unsigned flags) { - sqe->flags = flags; + sqe->flags = (__u8) flags; +} + +static inline void __io_uring_set_target_fixed_file(struct io_uring_sqe *sqe, + unsigned int file_index) +{ + /* 0 means no fixed files, indexes should be encoded as "index + 1" */ + sqe->file_index = file_index + 1; } static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, __u64 offset) { - sqe->opcode = op; + sqe->opcode = (__u8) op; sqe->flags = 0; sqe->ioprio = 0; sqe->fd = fd; @@ -223,27 +306,33 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, sqe->addr = (unsigned long) addr; sqe->len = len; sqe->rw_flags = 0; - sqe->user_data = 0; - sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0; + sqe->buf_index = 0; + sqe->personality = 0; + sqe->file_index = 0; + sqe->addr3 = 0; + sqe->__pad2[0] = 0; } /** * @pre Either fd_in or fd_out must be a pipe. * @param off_in If fd_in refers to a pipe, off_in must be (int64_t) -1; - * If fd_in does not refer to a pipe and off_in is (int64_t) -1, then bytes are read - * from fd_in starting from the file offset and it is adjust appropriately; - * If fd_in does not refer to a pipe and off_in is not (int64_t) -1, then the - * starting offset of fd_in will be off_in. + * If fd_in does not refer to a pipe and off_in is (int64_t) -1, + * then bytes are read from fd_in starting from the file offset + * and it is adjust appropriately; + * If fd_in does not refer to a pipe and off_in is not + * (int64_t) -1, then the starting offset of fd_in will be + * off_in. * @param off_out The description of off_in also applied to off_out. * @param splice_flags see man splice(2) for description of flags. 
* - * This splice operation can be used to implement sendfile by splicing to an intermediate pipe - * first, then splice to the final destination. + * This splice operation can be used to implement sendfile by splicing to an + * intermediate pipe first, then splice to the final destination. * In fact, the implementation of sendfile in kernel uses splice internally. * - * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still failed with - * EINVAL if one of the fd doesn't explicitly support splice operation, e.g. reading from terminal - * is unsupported from kernel 5.7 to 5.11. + * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation + * can still failed with EINVAL if one of the fd doesn't explicitly support + * splice operation, e.g. reading from terminal is unsupported from kernel 5.7 + * to 5.11. * Check issue #291 for more information. */ static inline void io_uring_prep_splice(struct io_uring_sqe *sqe, @@ -252,8 +341,9 @@ static inline void io_uring_prep_splice(struct io_uring_sqe *sqe, unsigned int nbytes, unsigned int splice_flags) { - io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, off_out); - sqe->splice_off_in = off_in; + io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, + (__u64) off_out); + sqe->splice_off_in = (__u64) off_in; sqe->splice_fd_in = fd_in; sqe->splice_flags = splice_flags; } @@ -271,32 +361,50 @@ static inline void io_uring_prep_tee(struct io_uring_sqe *sqe, static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) + unsigned nr_vecs, __u64 offset) { io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); } +static inline void io_uring_prep_readv2(struct io_uring_sqe *sqe, int fd, + const struct iovec *iovecs, + unsigned nr_vecs, __u64 offset, + int flags) +{ + io_uring_prep_readv(sqe, fd, iovecs, nr_vecs, offset); + sqe->rw_flags = flags; +} + static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, - off_t offset, int buf_index) + __u64 offset, int buf_index) { io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); - sqe->buf_index = buf_index; + sqe->buf_index = (__u16) buf_index; } static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) + unsigned nr_vecs, __u64 offset) { io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); } +static inline void io_uring_prep_writev2(struct io_uring_sqe *sqe, int fd, + const struct iovec *iovecs, + unsigned nr_vecs, __u64 offset, + int flags) +{ + io_uring_prep_writev(sqe, fd, iovecs, nr_vecs, offset); + sqe->rw_flags = flags; +} + static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, const void *buf, unsigned nbytes, - off_t offset, int buf_index) + __u64 offset, int buf_index) { io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); - sqe->buf_index = buf_index; + sqe->buf_index = (__u16) buf_index; } static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd, @@ -307,39 +415,51 @@ static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_sendmsg(struct io_uring_sqe *sqe, int fd, - const struct msghdr *msg, unsigned flags) + const struct msghdr *msg, + unsigned flags) { io_uring_prep_rw(IORING_OP_SENDMSG, sqe, fd, msg, 1, 0); sqe->msg_flags = flags; } -static inline void 
io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - unsigned poll_mask) +static inline unsigned __io_uring_prep_poll_mask(unsigned poll_mask) { - io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0); #if __BYTE_ORDER == __BIG_ENDIAN poll_mask = __swahw32(poll_mask); #endif - sqe->poll32_events = poll_mask; + return poll_mask; +} + +static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, + unsigned poll_mask) +{ + io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0); + sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask); +} + +static inline void io_uring_prep_poll_multishot(struct io_uring_sqe *sqe, + int fd, unsigned poll_mask) +{ + io_uring_prep_poll_add(sqe, fd, poll_mask); + sqe->len = IORING_POLL_ADD_MULTI; } static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, - void *user_data) + __u64 user_data) { - io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, user_data, 0, 0); + io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; } static inline void io_uring_prep_poll_update(struct io_uring_sqe *sqe, - void *old_user_data, - void *new_user_data, + __u64 old_user_data, + __u64 new_user_data, unsigned poll_mask, unsigned flags) { - io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, old_user_data, flags, - (__u64)new_user_data); -#if __BYTE_ORDER == __BIG_ENDIAN - poll_mask = __swahw32(poll_mask); -#endif - sqe->poll32_events = poll_mask; + io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, flags, + new_user_data); + sqe->addr = old_user_data; + sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask); } static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, @@ -365,8 +485,8 @@ static inline void io_uring_prep_timeout(struct io_uring_sqe *sqe, static inline void io_uring_prep_timeout_remove(struct io_uring_sqe *sqe, __u64 user_data, unsigned flags) { - io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, - (void *)(unsigned long)user_data, 0, 0); + io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; sqe->timeout_flags = flags; } @@ -374,9 +494,9 @@ static inline void io_uring_prep_timeout_update(struct io_uring_sqe *sqe, struct __kernel_timespec *ts, __u64 user_data, unsigned flags) { - io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, - (void *)(unsigned long)user_data, 0, - (uintptr_t)ts); + io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0, + (uintptr_t) ts); + sqe->addr = user_data; sqe->timeout_flags = flags | IORING_TIMEOUT_UPDATE; } @@ -386,14 +506,57 @@ static inline void io_uring_prep_accept(struct io_uring_sqe *sqe, int fd, { io_uring_prep_rw(IORING_OP_ACCEPT, sqe, fd, addr, 0, (__u64) (unsigned long) addrlen); - sqe->accept_flags = flags; + sqe->accept_flags = (__u32) flags; +} + +/* accept directly into the fixed file table */ +static inline void io_uring_prep_accept_direct(struct io_uring_sqe *sqe, int fd, + struct sockaddr *addr, + socklen_t *addrlen, int flags, + unsigned int file_index) +{ + io_uring_prep_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, file_index); +} + +static inline void io_uring_prep_multishot_accept(struct io_uring_sqe *sqe, + int fd, struct sockaddr *addr, + socklen_t *addrlen, int flags) +{ + io_uring_prep_accept(sqe, fd, addr, addrlen, flags); + sqe->ioprio |= IORING_ACCEPT_MULTISHOT; +} + +/* multishot accept directly into the fixed file table */ +static inline void io_uring_prep_multishot_accept_direct(struct io_uring_sqe *sqe, + int fd, + struct sockaddr *addr, 
+ socklen_t *addrlen, + int flags) +{ + io_uring_prep_multishot_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1); +} + +static inline void io_uring_prep_cancel64(struct io_uring_sqe *sqe, + __u64 user_data, int flags) +{ + io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; + sqe->cancel_flags = (__u32) flags; +} + +static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe, + void *user_data, int flags) +{ + io_uring_prep_cancel64(sqe, (__u64) (uintptr_t) user_data, flags); } -static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_data, - int flags) +static inline void io_uring_prep_cancel_fd(struct io_uring_sqe *sqe, int fd, + unsigned int flags) { - io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, user_data, 0, 0); - sqe->cancel_flags = flags; + io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, fd, NULL, 0, 0); + sqe->cancel_flags = (__u32) flags | IORING_ASYNC_CANCEL_FD; } static inline void io_uring_prep_link_timeout(struct io_uring_sqe *sqe, @@ -415,7 +578,8 @@ static inline void io_uring_prep_files_update(struct io_uring_sqe *sqe, int *fds, unsigned nr_fds, int offset) { - io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, offset); + io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, + (__u64) offset); } static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd, @@ -423,14 +587,26 @@ static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd, { io_uring_prep_rw(IORING_OP_FALLOCATE, sqe, fd, - (const uintptr_t *) (unsigned long) len, mode, offset); + (const uintptr_t *) (unsigned long) len, + (unsigned int) mode, (__u64) offset); } static inline void io_uring_prep_openat(struct io_uring_sqe *sqe, int dfd, - const char *path, int flags, mode_t mode) + const char *path, int flags, + mode_t mode) { io_uring_prep_rw(IORING_OP_OPENAT, sqe, dfd, path, mode, 0); - sqe->open_flags = flags; + sqe->open_flags = (__u32) flags; +} + +/* open directly into the fixed file table */ +static inline void io_uring_prep_openat_direct(struct io_uring_sqe *sqe, + int dfd, const char *path, + int flags, mode_t mode, + unsigned file_index) +{ + io_uring_prep_openat(sqe, dfd, path, flags, mode); + __io_uring_set_target_fixed_file(sqe, file_index); } static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd) @@ -438,14 +614,22 @@ static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd) io_uring_prep_rw(IORING_OP_CLOSE, sqe, fd, NULL, 0, 0); } +static inline void io_uring_prep_close_direct(struct io_uring_sqe *sqe, + unsigned file_index) +{ + io_uring_prep_close(sqe, 0); + __io_uring_set_target_fixed_file(sqe, file_index); +} + static inline void io_uring_prep_read(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, off_t offset) + void *buf, unsigned nbytes, __u64 offset) { io_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset); } static inline void io_uring_prep_write(struct io_uring_sqe *sqe, int fd, - const void *buf, unsigned nbytes, off_t offset) + const void *buf, unsigned nbytes, + __u64 offset) { io_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset); } @@ -457,35 +641,35 @@ static inline void io_uring_prep_statx(struct io_uring_sqe *sqe, int dfd, { io_uring_prep_rw(IORING_OP_STATX, sqe, dfd, path, mask, (__u64) (unsigned long) statxbuf); - sqe->statx_flags = flags; + sqe->statx_flags = (__u32) flags; } static inline void io_uring_prep_fadvise(struct io_uring_sqe 
*sqe, int fd, - off_t offset, off_t len, int advice) + __u64 offset, off_t len, int advice) { - io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, len, offset); - sqe->fadvise_advice = advice; + io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, (__u32) len, offset); + sqe->fadvise_advice = (__u32) advice; } static inline void io_uring_prep_madvise(struct io_uring_sqe *sqe, void *addr, off_t length, int advice) { - io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, length, 0); - sqe->fadvise_advice = advice; + io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, (__u32) length, 0); + sqe->fadvise_advice = (__u32) advice; } static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, const void *buf, size_t len, int flags) { - io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, len, 0); - sqe->msg_flags = flags; + io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, (__u32) len, 0); + sqe->msg_flags = (__u32) flags; } static inline void io_uring_prep_recv(struct io_uring_sqe *sqe, int sockfd, void *buf, size_t len, int flags) { - io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, len, 0); - sqe->msg_flags = flags; + io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, (__u32) len, 0); + sqe->msg_flags = (__u32) flags; } static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd, @@ -495,57 +679,82 @@ static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd, (uint64_t) (uintptr_t) how); } +/* open directly into the fixed file table */ +static inline void io_uring_prep_openat2_direct(struct io_uring_sqe *sqe, + int dfd, const char *path, + struct open_how *how, + unsigned file_index) +{ + io_uring_prep_openat2(sqe, dfd, path, how); + __io_uring_set_target_fixed_file(sqe, file_index); +} + struct epoll_event; static inline void io_uring_prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd, int fd, int op, struct epoll_event *ev) { - io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev, op, fd); + io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev, + (__u32) op, (__u32) fd); } static inline void io_uring_prep_provide_buffers(struct io_uring_sqe *sqe, void *addr, int len, int nr, int bgid, int bid) { - io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, len, bid); - sqe->buf_group = bgid; + io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, (__u32) len, + (__u64) bid); + sqe->buf_group = (__u16) bgid; } static inline void io_uring_prep_remove_buffers(struct io_uring_sqe *sqe, int nr, int bgid) { io_uring_prep_rw(IORING_OP_REMOVE_BUFFERS, sqe, nr, NULL, 0, 0); - sqe->buf_group = bgid; + sqe->buf_group = (__u16) bgid; } static inline void io_uring_prep_shutdown(struct io_uring_sqe *sqe, int fd, int how) { - io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, how, 0); + io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, (__u32) how, 0); } static inline void io_uring_prep_unlinkat(struct io_uring_sqe *sqe, int dfd, const char *path, int flags) { io_uring_prep_rw(IORING_OP_UNLINKAT, sqe, dfd, path, 0, 0); - sqe->unlink_flags = flags; + sqe->unlink_flags = (__u32) flags; +} + +static inline void io_uring_prep_unlink(struct io_uring_sqe *sqe, + const char *path, int flags) +{ + io_uring_prep_unlinkat(sqe, AT_FDCWD, path, flags); } static inline void io_uring_prep_renameat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath, int newdfd, const char *newpath, int flags) { - io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath, newdfd, + io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath, + (__u32) newdfd, (uint64_t) (uintptr_t) 
newpath); - sqe->rename_flags = flags; + sqe->rename_flags = (__u32) flags; +} + +static inline void io_uring_prep_rename(struct io_uring_sqe *sqe, + const char *oldpath, const char *newpath) +{ + io_uring_prep_renameat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); } static inline void io_uring_prep_sync_file_range(struct io_uring_sqe *sqe, int fd, unsigned len, - off_t offset, int flags) + __u64 offset, int flags) { io_uring_prep_rw(IORING_OP_SYNC_FILE_RANGE, sqe, fd, NULL, len, offset); - sqe->sync_range_flags = flags; + sqe->sync_range_flags = (__u32) flags; } static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd, @@ -554,20 +763,123 @@ static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd, io_uring_prep_rw(IORING_OP_MKDIRAT, sqe, dfd, path, mode, 0); } +static inline void io_uring_prep_mkdir(struct io_uring_sqe *sqe, + const char *path, mode_t mode) +{ + io_uring_prep_mkdirat(sqe, AT_FDCWD, path, mode); +} + static inline void io_uring_prep_symlinkat(struct io_uring_sqe *sqe, - const char *target, int newdirfd, const char *linkpath) + const char *target, int newdirfd, + const char *linkpath) { io_uring_prep_rw(IORING_OP_SYMLINKAT, sqe, newdirfd, target, 0, (uint64_t) (uintptr_t) linkpath); } +static inline void io_uring_prep_symlink(struct io_uring_sqe *sqe, + const char *target, const char *linkpath) +{ + io_uring_prep_symlinkat(sqe, target, AT_FDCWD, linkpath); +} + static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath, int newdfd, const char *newpath, int flags) { - io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, newdfd, + io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, (__u32) newdfd, (uint64_t) (uintptr_t) newpath); - sqe->hardlink_flags = flags; + sqe->hardlink_flags = (__u32) flags; +} + +static inline void io_uring_prep_link(struct io_uring_sqe *sqe, + const char *oldpath, const char *newpath, int flags) +{ + io_uring_prep_linkat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, flags); +} + +static inline void io_uring_prep_msg_ring(struct io_uring_sqe *sqe, int fd, + unsigned int len, __u64 data, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_MSG_RING, sqe, fd, NULL, len, data); + sqe->rw_flags = flags; +} + +static inline void io_uring_prep_getxattr(struct io_uring_sqe *sqe, + const char *name, + const char *value, + const char *path, + size_t len) +{ + io_uring_prep_rw(IORING_OP_GETXATTR, sqe, 0, name, len, + (__u64) (uintptr_t) value); + sqe->addr3 = (__u64) (uintptr_t) path; + sqe->xattr_flags = 0; +} + +static inline void io_uring_prep_setxattr(struct io_uring_sqe *sqe, + const char *name, + const char *value, + const char *path, + int flags, + size_t len) +{ + io_uring_prep_rw(IORING_OP_SETXATTR, sqe, 0, name, len, + (__u64) (uintptr_t) value); + sqe->addr3 = (__u64) (uintptr_t) path; + sqe->xattr_flags = flags; +} + +static inline void io_uring_prep_fgetxattr(struct io_uring_sqe *sqe, + int fd, + const char *name, + const char *value, + size_t len) +{ + io_uring_prep_rw(IORING_OP_FGETXATTR, sqe, fd, name, len, + (__u64) (uintptr_t) value); + sqe->xattr_flags = 0; +} + +static inline void io_uring_prep_fsetxattr(struct io_uring_sqe *sqe, + int fd, + const char *name, + const char *value, + int flags, + size_t len) +{ + io_uring_prep_rw(IORING_OP_FSETXATTR, sqe, fd, name, len, + (__u64) (uintptr_t) value); + sqe->xattr_flags = flags; +} + +static inline void io_uring_prep_socket(struct io_uring_sqe *sqe, int domain, + int type, int protocol, + unsigned int flags) +{ + 
io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; +} + +static inline void io_uring_prep_socket_direct(struct io_uring_sqe *sqe, + int domain, int type, + int protocol, + unsigned file_index, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; + __io_uring_set_target_fixed_file(sqe, file_index); +} + +static inline void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe, + int domain, int type, int protocol, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; + __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1); } /* @@ -576,15 +888,18 @@ static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd, */ static inline unsigned io_uring_sq_ready(const struct io_uring *ring) { + unsigned khead = *ring->sq.khead; + /* - * Without a barrier, we could miss an update and think the SQ wasn't ready. - * We don't need the load acquire for non-SQPOLL since then we drive updates. + * Without a barrier, we could miss an update and think the SQ wasn't + * ready. We don't need the load acquire for non-SQPOLL since then we + * drive updates. */ if (ring->flags & IORING_SETUP_SQPOLL) - return ring->sq.sqe_tail - io_uring_smp_load_acquire(ring->sq.khead); + khead = io_uring_smp_load_acquire(ring->sq.khead); /* always use real head, to avoid losing sync for short submit */ - return ring->sq.sqe_tail - *ring->sq.khead; + return ring->sq.sqe_tail - khead; } /* @@ -671,12 +986,62 @@ static inline int io_uring_wait_cqe_nr(struct io_uring *ring, } /* + * Internal helper, don't use directly in applications. Use one of the + * "official" versions of this, io_uring_peek_cqe(), io_uring_wait_cqe(), + * or io_uring_wait_cqes*(). + */ +static inline int __io_uring_peek_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + unsigned *nr_available) +{ + struct io_uring_cqe *cqe; + int err = 0; + unsigned available; + unsigned mask = *ring->cq.kring_mask; + int shift = 0; + + if (ring->flags & IORING_SETUP_CQE32) + shift = 1; + + do { + unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail); + unsigned head = *ring->cq.khead; + + cqe = NULL; + available = tail - head; + if (!available) + break; + + cqe = &ring->cq.cqes[(head & mask) << shift]; + if (!(ring->features & IORING_FEAT_EXT_ARG) && + cqe->user_data == LIBURING_UDATA_TIMEOUT) { + if (cqe->res < 0) + err = cqe->res; + io_uring_cq_advance(ring, 1); + if (!err) + continue; + cqe = NULL; + } + + break; + } while (1); + + *cqe_ptr = cqe; + if (nr_available) + *nr_available = available; + return err; +} + +/* * Return an IO completion, if one is readily available. Returns 0 with * cqe_ptr filled in on success, -errno on failure. */ static inline int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { + if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr) + return 0; + return io_uring_wait_cqe_nr(ring, cqe_ptr, 0); } @@ -687,9 +1052,105 @@ static inline int io_uring_peek_cqe(struct io_uring *ring, static inline int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { + if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr) + return 0; + return io_uring_wait_cqe_nr(ring, cqe_ptr, 1); } +/* + * Return an sqe to fill. Application must later call io_uring_submit() + * when it's ready to tell the kernel about it. 
The caller may call this + * function multiple times before calling io_uring_submit(). + * + * Returns a vacant sqe, or NULL if we're full. + */ +static inline struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring) +{ + struct io_uring_sq *sq = &ring->sq; + unsigned int head = io_uring_smp_load_acquire(sq->khead); + unsigned int next = sq->sqe_tail + 1; + int shift = 0; + + if (ring->flags & IORING_SETUP_SQE128) + shift = 1; + + if (next - head <= *sq->kring_entries) { + struct io_uring_sqe *sqe; + + sqe = &sq->sqes[(sq->sqe_tail & *sq->kring_mask) << shift]; + sq->sqe_tail = next; + return sqe; + } + + return NULL; +} + +/* + * Return the appropriate mask for a buffer ring of size 'ring_entries' + */ +static inline int io_uring_buf_ring_mask(__u32 ring_entries) +{ + return ring_entries - 1; +} + +static inline void io_uring_buf_ring_init(struct io_uring_buf_ring *br) +{ + br->tail = 0; +} + +/* + * Assign 'buf' with the addr/len/buffer ID supplied + */ +static inline void io_uring_buf_ring_add(struct io_uring_buf_ring *br, + void *addr, unsigned int len, + unsigned short bid, int mask, + int buf_offset) +{ + struct io_uring_buf *buf = &br->bufs[(br->tail + buf_offset) & mask]; + + buf->addr = (unsigned long) (uintptr_t) addr; + buf->len = len; + buf->bid = bid; +} + +/* + * Make 'count' new buffers visible to the kernel. Called after + * io_uring_buf_ring_add() has been called 'count' times to fill in new + * buffers. + */ +static inline void io_uring_buf_ring_advance(struct io_uring_buf_ring *br, + int count) +{ + unsigned short new_tail = br->tail + count; + + io_uring_smp_store_release(&br->tail, new_tail); +} + +/* + * Make 'count' new buffers visible to the kernel while at the same time + * advancing the CQ ring seen entries. This can be used when the application + * is using ring provided buffers and returns buffers while processing CQEs, + * avoiding an extra atomic when needing to increment both the CQ ring and + * the ring buffer index at the same time. 
+ */ +static inline void io_uring_buf_ring_cq_advance(struct io_uring *ring, + struct io_uring_buf_ring *br, + int count) +{ + br->tail += count; + io_uring_cq_advance(ring, count); +} + +#ifndef LIBURING_INTERNAL +static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) +{ + return _io_uring_get_sqe(ring); +} +#else +struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); +#endif + ssize_t io_uring_mlock_size(unsigned entries, unsigned flags); ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p); diff --git a/src/include/liburing/barrier.h b/src/include/liburing/barrier.h index 89ac682..aedeb47 100644 --- a/src/include/liburing/barrier.h +++ b/src/include/liburing/barrier.h @@ -52,6 +52,11 @@ static inline T io_uring_smp_load_acquire(const T *p) reinterpret_cast<const std::atomic<T> *>(p), std::memory_order_acquire); } + +static inline void io_uring_smp_mb() +{ + std::atomic_thread_fence(std::memory_order_seq_cst); +} #else #include <stdatomic.h> @@ -68,6 +73,9 @@ static inline T io_uring_smp_load_acquire(const T *p) #define io_uring_smp_load_acquire(p) \ atomic_load_explicit((_Atomic __typeof__(*(p)) *)(p), \ memory_order_acquire) + +#define io_uring_smp_mb() \ + atomic_thread_fence(memory_order_seq_cst) #endif #endif /* defined(LIBURING_BARRIER_H) */ diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h index e4a4fc4..2f391c9 100644 --- a/src/include/liburing/io_uring.h +++ b/src/include/liburing/io_uring.h @@ -49,25 +49,35 @@ struct io_uring_sqe { __u32 rename_flags; __u32 unlink_flags; __u32 hardlink_flags; + __u32 xattr_flags; }; __u64 user_data; /* data to be passed back at completion time */ + /* pack this to avoid bogus arm OABI complaints */ union { - struct { - /* pack this to avoid bogus arm OABI complaints */ - union { - /* index into fixed buffers, if used */ - __u16 buf_index; - /* for grouped buffer selection */ - __u16 buf_group; - } __attribute__((packed)); - /* personality to use, if used */ - __u16 personality; - __s32 splice_fd_in; - }; - __u64 __pad2[3]; + /* index into fixed buffers, if used */ + __u16 buf_index; + /* for grouped buffer selection */ + __u16 buf_group; + } __attribute__((packed)); + /* personality to use, if used */ + __u16 personality; + union { + __s32 splice_fd_in; + __u32 file_index; }; + __u64 addr3; + __u64 __pad2[1]; }; +/* + * If sqe->file_index is set to this for opcodes that instantiate a new + * direct descriptor (like openat/openat2/accept), then io_uring will allocate + * an available direct descriptor instead of having the application pass one + * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE + * if the space is full. 
+ */ +#define IORING_FILE_INDEX_ALLOC (~0U) + enum { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, @@ -75,6 +85,7 @@ enum { IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, IOSQE_BUFFER_SELECT_BIT, + IOSQE_CQE_SKIP_SUCCESS_BIT, }; /* @@ -92,6 +103,8 @@ enum { #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* select buffer from sqe->buf_group */ #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) +/* don't post CQE if request succeeded */ +#define IOSQE_CQE_SKIP_SUCCESS (1U << IOSQE_CQE_SKIP_SUCCESS_BIT) /* * io_uring_setup() flags @@ -103,8 +116,26 @@ enum { #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. + */ +#define IORING_SETUP_COOP_TASKRUN (1U << 8) +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + */ +#define IORING_SETUP_TASKRUN_FLAG (1U << 9) -enum { +#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ +#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ + +enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, IORING_OP_WRITEV, @@ -145,6 +176,13 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_MSG_RING, + IORING_OP_FSETXATTR, + IORING_OP_SETXATTR, + IORING_OP_FGETXATTR, + IORING_OP_GETXATTR, + IORING_OP_SOCKET, + IORING_OP_URING_CMD, /* this goes last, obviously */ IORING_OP_LAST, @@ -158,9 +196,14 @@ enum { /* * sqe->timeout_flags */ -#define IORING_TIMEOUT_ABS (1U << 0) -#define IORING_TIMEOUT_UPDATE (1U << 1) - +#define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) +#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) +#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) +#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* * sqe->splice_flags * extends splice(2) flags @@ -183,12 +226,45 @@ enum { #define IORING_POLL_UPDATE_USER_DATA (1U << 2) /* + * ASYNC_CANCEL flags. + * + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the + * request 'user_data' + * IORING_ASYNC_CANCEL_ANY Match any request + */ +#define IORING_ASYNC_CANCEL_ALL (1U << 0) +#define IORING_ASYNC_CANCEL_FD (1U << 1) +#define IORING_ASYNC_CANCEL_ANY (1U << 2) + +/* + * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send + * or receive and arm poll if that yields an + * -EAGAIN result, arm poll upfront and skip + * the initial transfer attempt. 
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index e4a4fc4..2f391c9 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -49,25 +49,35 @@ struct io_uring_sqe {
 		__u32		rename_flags;
 		__u32		unlink_flags;
 		__u32		hardlink_flags;
+		__u32		xattr_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
+	/* pack this to avoid bogus arm OABI complaints */
 	union {
-		struct {
-			/* pack this to avoid bogus arm OABI complaints */
-			union {
-				/* index into fixed buffers, if used */
-				__u16	buf_index;
-				/* for grouped buffer selection */
-				__u16	buf_group;
-			} __attribute__((packed));
-			/* personality to use, if used */
-			__u16	personality;
-			__s32	splice_fd_in;
-		};
-		__u64	__pad2[3];
+		/* index into fixed buffers, if used */
+		__u16	buf_index;
+		/* for grouped buffer selection */
+		__u16	buf_group;
+	} __attribute__((packed));
+	/* personality to use, if used */
+	__u16	personality;
+	union {
+		__s32	splice_fd_in;
+		__u32	file_index;
 	};
+	__u64	addr3;
+	__u64	__pad2[1];
 };
 
+/*
+ * If sqe->file_index is set to this for opcodes that instantiate a new
+ * direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ * an available direct descriptor instead of having the application pass one
+ * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
+ * if the space is full.
+ */
+#define IORING_FILE_INDEX_ALLOC		(~0U)
+
 enum {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,
@@ -75,6 +85,7 @@ enum {
 	IOSQE_IO_HARDLINK_BIT,
 	IOSQE_ASYNC_BIT,
 	IOSQE_BUFFER_SELECT_BIT,
+	IOSQE_CQE_SKIP_SUCCESS_BIT,
 };
 
 /*
@@ -92,6 +103,8 @@ enum {
 #define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
 /* select buffer from sqe->buf_group */
 #define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
 
 /*
  * io_uring_setup() flags
 */
@@ -103,8 +116,26 @@
 #define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
+#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Only valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
 
-enum {
+#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
+
+enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
 	IORING_OP_WRITEV,
@@ -145,6 +176,13 @@ enum {
 	IORING_OP_MKDIRAT,
 	IORING_OP_SYMLINKAT,
 	IORING_OP_LINKAT,
+	IORING_OP_MSG_RING,
+	IORING_OP_FSETXATTR,
+	IORING_OP_SETXATTR,
+	IORING_OP_FGETXATTR,
+	IORING_OP_GETXATTR,
+	IORING_OP_SOCKET,
+	IORING_OP_URING_CMD,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -158,9 +196,14 @@ enum {
 /*
  * sqe->timeout_flags
  */
-#define IORING_TIMEOUT_ABS	(1U << 0)
-#define IORING_TIMEOUT_UPDATE	(1U << 1)
-
+#define IORING_TIMEOUT_ABS		(1U << 0)
+#define IORING_TIMEOUT_UPDATE		(1U << 1)
+#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
+#define IORING_TIMEOUT_REALTIME		(1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
+#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
  * sqe->splice_flags
  * extends splice(2) flags
@@ -183,12 +226,45 @@
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
 
 /*
+ * ASYNC_CANCEL flags.
+ *
+ * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
+ * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
+ *				request 'user_data'
+ * IORING_ASYNC_CANCEL_ANY	Match any request
+ */
+#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
+#define IORING_ASYNC_CANCEL_FD	(1U << 1)
+#define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+
+/*
+ * send/sendmsg and recv/recvmsg flags (sqe->addr2)
+ *
+ * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
+ *				or receive and arm poll if that yields an
+ *				-EAGAIN result, arm poll upfront and skip
+ *				the initial transfer attempt.
+ */
+#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
+
+/*
+ * accept flags stored in sqe->ioprio
+ */
+#define IORING_ACCEPT_MULTISHOT	(1U << 0)
+
+/*
  * IO completion data structure (Completion Queue Entry)
  */
 struct io_uring_cqe {
 	__u64	user_data;	/* sqe->data submission passed back */
 	__s32	res;		/* result code for this event */
 	__u32	flags;
+
+	/*
+	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
+	 * contains 16 bytes of padding, doubling the size of the CQE.
+	 */
+	__u64 big_cqe[];
 };
 
 /*
@@ -196,9 +272,11 @@ struct io_uring_cqe {
  *
  * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
  * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
+ * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
+#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
@@ -231,6 +309,7 @@ struct io_sqring_offsets {
 */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
 #define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
+#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */
 
 struct io_cqring_offsets {
 	__u32 head;
@@ -254,10 +333,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -289,6 +369,8 @@ struct io_uring_params {
 #define IORING_FEAT_EXT_ARG		(1U << 8)
 #define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
 #define IORING_FEAT_RSRC_TAGS		(1U << 10)
+#define IORING_FEAT_CQE_SKIP		(1U << 11)
+#define IORING_FEAT_LINKED_FILE		(1U << 12)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -314,10 +396,31 @@ enum {
 	IORING_REGISTER_BUFFERS2		= 15,
 	IORING_REGISTER_BUFFERS_UPDATE		= 16,
 
+	/* set/clear io-wq thread affinities */
+	IORING_REGISTER_IOWQ_AFF		= 17,
+	IORING_UNREGISTER_IOWQ_AFF		= 18,
+
+	/* set/get max number of io-wq workers */
+	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
+
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
+	/* register ring based provide buffer group */
+	IORING_REGISTER_PBUF_RING		= 22,
+	IORING_UNREGISTER_PBUF_RING		= 23,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
 
+/* io-wq worker categories */
+enum {
+	IO_WQ_BOUND,
+	IO_WQ_UNBOUND,
+};
+
 /* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
 	__u32 offset;
@@ -325,9 +428,15 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+/*
+ * Register a fully sparse file space, rather than pass in an array of all
+ * -1 file descriptors.
+ */
+#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)
+
 struct io_uring_rsrc_register {
 	__u32 nr;
-	__u32 resv;
+	__u32 flags;
 	__u64 resv2;
 	__aligned_u64 data;
 	__aligned_u64 tags;
@@ -365,7 +474,7 @@ struct io_uring_probe {
 	__u8 ops_len;	/* length of ops[] array below */
 	__u16 resv;
 	__u32 resv2[3];
-	struct io_uring_probe_op ops[];
+	struct io_uring_probe_op ops[0];
 };
 
 struct io_uring_restriction {
@@ -379,6 +488,38 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };
 
+struct io_uring_buf {
+	__u64	addr;
+	__u32	len;
+	__u16	bid;
+	__u16	resv;
+};
+
+struct io_uring_buf_ring {
+	union {
+		/*
+		 * To avoid spilling into more pages than we need to, the
+		 * ring tail is overlaid with the io_uring_buf->resv field.
+		 */
+		struct {
+			__u64	resv1;
+			__u32	resv2;
+			__u16	resv3;
+			__u16	tail;
+		};
+		struct io_uring_buf bufs[0];
+	};
+};
+
+/* argument for IORING_(UN)REGISTER_PBUF_RING */
+struct io_uring_buf_reg {
+	__u64	ring_addr;
+	__u32	ring_entries;
+	__u16	bgid;
+	__u16	pad;
+	__u64	resv[3];
+};
+
 /*
  * io_uring_restriction->opcode values
  */
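With IORING_FILE_INDEX_ALLOC, the kernel picks a free slot in the ring's direct-descriptor table instead of the application choosing one. A sketch that sets the field on the raw SQE, exactly as the header comment above describes ("data.txt" is a hypothetical path, and the table is assumed to have been created with io_uring_register_files_sparse() from this release):

	/* reserve an empty 64-slot direct descriptor table once, at setup */
	io_uring_register_files_sparse(&ring, 64);

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_openat(sqe, AT_FDCWD, "data.txt", O_RDONLY, 0);
	sqe->file_index = IORING_FILE_INDEX_ALLOC;	/* let the kernel pick */
	io_uring_submit(&ring);
	/* cqe->res is the allocated slot on success, or -ENFILE if full */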
diff --git a/src/int_flags.h b/src/int_flags.h
new file mode 100644
index 0000000..90505ec
--- /dev/null
+++ b/src/int_flags.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef LIBURING_INT_FLAGS
+#define LIBURING_INT_FLAGS
+
+enum {
+	INT_FLAG_REG_RING	= 1,
+};
+
+#endif
diff --git a/src/lib.h b/src/lib.h
new file mode 100644
index 0000000..6672cc5
--- /dev/null
+++ b/src/lib.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef LIBURING_LIB_H
+#define LIBURING_LIB_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define __INTERNAL__LIBURING_LIB_H
+#if defined(__x86_64__) || defined(__i386__)
+	#include "arch/x86/lib.h"
+#else
+	/*
+	 * We don't have nolibc support for this arch. Must use libc!
+	 */
+	#ifdef CONFIG_NOLIBC
+		#error "This arch doesn't support building liburing without libc"
+	#endif
+	/* libc wrappers. */
+	#include "arch/generic/lib.h"
+#endif
+#undef __INTERNAL__LIBURING_LIB_H
+
+
+#ifndef offsetof
+	#define offsetof(TYPE, FIELD) ((size_t) &((TYPE *)0)->FIELD)
+#endif
+
+#ifndef container_of
+	#define container_of(PTR, TYPE, FIELD) ({			\
+		__typeof__(((TYPE *)0)->FIELD) *__FIELD_PTR = (PTR);	\
+		(TYPE *)((char *) __FIELD_PTR - offsetof(TYPE, FIELD));	\
+	})
+#endif
+
+void *__uring_malloc(size_t len);
+void __uring_free(void *p);
+
+static inline void *uring_malloc(size_t len)
+{
+#ifdef CONFIG_NOLIBC
+	return __uring_malloc(len);
+#else
+	return malloc(len);
+#endif
+}
+
+static inline void uring_free(void *ptr)
+{
+#ifdef CONFIG_NOLIBC
+	__uring_free(ptr);
+#else
+	free(ptr);
+#endif
+}
+
+#endif /* #ifndef LIBURING_LIB_H */
diff --git a/src/liburing.map b/src/liburing.map
index 012ac4e..879f791 100644
--- a/src/liburing.map
+++ b/src/liburing.map
@@ -36,4 +36,22 @@ LIBURING_2.1 {
 	global:
 		io_uring_mlock_size_params;
 		io_uring_mlock_size;
+		io_uring_register_buffers_tags;
+		io_uring_register_buffers_update_tag;
+		io_uring_register_files_tags;
+		io_uring_register_files_update_tag;
+		io_uring_register_iowq_aff;
+		io_uring_unregister_iowq_aff;
+		io_uring_register_iowq_max_workers;
 } LIBURING_2.0;
+
+LIBURING_2.2 {
+	global:
+		io_uring_submit_and_wait_timeout;
+		io_uring_register_ring_fd;
+		io_uring_unregister_ring_fd;
+		io_uring_register_files_sparse;
+		io_uring_register_buffers_sparse;
+		io_uring_register_buf_ring;
+		io_uring_unregister_buf_ring;
+} LIBURING_2.1;
diff --git a/src/nolibc.c b/src/nolibc.c
new file mode 100644
index 0000000..9a04ead
--- /dev/null
+++ b/src/nolibc.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef CONFIG_NOLIBC
+#error "This file should only be compiled for no libc build"
+#endif
+
+#include "lib.h"
+#include "syscall.h"
+
+void *memset(void *s, int c, size_t n)
+{
+	size_t i;
+	unsigned char *p = s;
+
+	for (i = 0; i < n; i++)
+		p[i] = (unsigned char) c;
+
+	return s;
+}
+
+struct uring_heap {
+	size_t		len;
+	char		user_p[] __attribute__((__aligned__));
+};
+
+void *__uring_malloc(size_t len)
+{
+	struct uring_heap *heap;
+
+	heap = __sys_mmap(NULL, sizeof(*heap) + len, PROT_READ | PROT_WRITE,
+			  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (IS_ERR(heap))
+		return NULL;
+
+	heap->len = sizeof(*heap) + len;
+	return heap->user_p;
+}
+
+void __uring_free(void *p)
+{
+	struct uring_heap *heap;
+
+	if (uring_unlikely(!p))
+		return;
+
+	heap = container_of(p, struct uring_heap, user_p);
+	__sys_munmap(heap, heap->len);
+}
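In the nolibc build, allocations come straight from mmap: __uring_malloc() stores the mapping length in a small header and returns the aligned payload, and __uring_free() walks back to the header with container_of() so it can munmap() the exact region. Call sites only use the uring_malloc()/uring_free() switches from lib.h (a sketch of the internal contract):

	void *p = uring_malloc(256);	/* mmap-backed when CONFIG_NOLIBC is set */
	if (p)
		uring_free(p);		/* unmaps header and payload together */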
diff --git a/src/queue.c b/src/queue.c
index 2f0f19b..ce0ecf6 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -1,20 +1,12 @@
 /* SPDX-License-Identifier: MIT */
 #define _POSIX_C_SOURCE 200112L
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdbool.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
 #include "liburing/compat.h"
 #include "liburing/io_uring.h"
-#include "liburing.h"
-#include "liburing/barrier.h"
-
-#include "syscall.h"
 
 /*
  * Returns true if we're not using SQ thread (thus nobody submits but us)
@@ -26,6 +18,12 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
 	if (!(ring->flags & IORING_SETUP_SQPOLL))
 		return true;
 
+	/*
+	 * Ensure the kernel can see the store to the SQ tail before we read
+	 * the flags.
+	 */
+	io_uring_smp_mb();
+
 	if (uring_unlikely(IO_URING_READ_ONCE(*ring->sq.kflags) &
 			   IORING_SQ_NEED_WAKEUP)) {
 		*flags |= IORING_ENTER_SQ_WAKEUP;
@@ -37,44 +35,13 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
 
 static inline bool cq_ring_needs_flush(struct io_uring *ring)
 {
-	return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
+	return IO_URING_READ_ONCE(*ring->sq.kflags) &
+			(IORING_SQ_CQ_OVERFLOW | IORING_SQ_TASKRUN);
 }
 
-static int __io_uring_peek_cqe(struct io_uring *ring,
-			       struct io_uring_cqe **cqe_ptr,
-			       unsigned *nr_available)
+static inline bool cq_ring_needs_enter(struct io_uring *ring)
 {
-	struct io_uring_cqe *cqe;
-	int err = 0;
-	unsigned available;
-	unsigned mask = *ring->cq.kring_mask;
-
-	do {
-		unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
-		unsigned head = *ring->cq.khead;
-
-		cqe = NULL;
-		available = tail - head;
-		if (!available)
-			break;
-
-		cqe = &ring->cq.cqes[head & mask];
-		if (!(ring->features & IORING_FEAT_EXT_ARG) &&
-		    cqe->user_data == LIBURING_UDATA_TIMEOUT) {
-			if (cqe->res < 0)
-				err = cqe->res;
-			io_uring_cq_advance(ring, 1);
-			if (!err)
-				continue;
-			cqe = NULL;
-		}
-
-		break;
-	} while (1);
-
-	*cqe_ptr = cqe;
-	*nr_available = available;
-	return err;
+	return (ring->flags & IORING_SETUP_IOPOLL) || cq_ring_needs_flush(ring);
 }
 
 struct get_data {
@@ -85,15 +52,16 @@ struct get_data {
 	void *arg;
 };
 
-static int _io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+static int _io_uring_get_cqe(struct io_uring *ring,
+			     struct io_uring_cqe **cqe_ptr,
 			     struct get_data *data)
 {
 	struct io_uring_cqe *cqe = NULL;
+	bool looped = false;
 	int err;
 
 	do {
 		bool need_enter = false;
-		bool cq_overflow_flush = false;
 		unsigned flags = 0;
 		unsigned nr_available;
 		int ret;
@@ -102,34 +70,40 @@ static int _io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
 		if (err)
 			break;
 		if (!cqe && !data->wait_nr && !data->submit) {
-			if (!cq_ring_needs_flush(ring)) {
+			/*
+			 * If we already looped once, we already entered
+			 * the kernel. Since there's nothing to submit or
+			 * wait for, don't keep retrying.
+			 */
+			if (looped || !cq_ring_needs_enter(ring)) {
 				err = -EAGAIN;
 				break;
 			}
-			cq_overflow_flush = true;
+			need_enter = true;
 		}
-		if (data->wait_nr > nr_available || cq_overflow_flush) {
+		if (data->wait_nr > nr_available || need_enter) {
 			flags = IORING_ENTER_GETEVENTS | data->get_flags;
 			need_enter = true;
 		}
-		if (data->submit) {
-			sq_ring_needs_enter(ring, &flags);
+		if (data->submit && sq_ring_needs_enter(ring, &flags))
 			need_enter = true;
-		}
 		if (!need_enter)
 			break;
 
-		ret = __sys_io_uring_enter2(ring->ring_fd, data->submit,
-					    data->wait_nr, flags, data->arg,
-					    data->sz);
+		if (ring->int_flags & INT_FLAG_REG_RING)
+			flags |= IORING_ENTER_REGISTERED_RING;
+		ret = ____sys_io_uring_enter2(ring->enter_ring_fd, data->submit,
					      data->wait_nr, flags, data->arg,
+					      data->sz);
 		if (ret < 0) {
-			err = -errno;
+			err = ret;
 			break;
 		}
 
 		data->submit -= ret;
 		if (cqe)
 			break;
+		looped = true;
 	} while (1);
 
 	*cqe_ptr = cqe;
@@ -159,6 +133,10 @@ unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
 {
 	unsigned ready;
 	bool overflow_checked = false;
+	int shift = 0;
+
+	if (ring->flags & IORING_SETUP_CQE32)
+		shift = 1;
 
 again:
 	ready = io_uring_cq_ready(ring);
@@ -171,7 +149,7 @@ again:
 	count = count > ready ? ready : count;
 	last = head + count;
 	for (;head != last; head++, i++)
-		cqes[i] = &ring->cq.cqes[head & mask];
+		cqes[i] = &ring->cq.cqes[(head & mask) << shift];
 
 	return count;
 }
@@ -180,8 +158,11 @@ again:
 		goto done;
 
 	if (cq_ring_needs_flush(ring)) {
-		__sys_io_uring_enter(ring->ring_fd, 0, 0,
-				     IORING_ENTER_GETEVENTS, NULL);
+		int flags = IORING_ENTER_GETEVENTS;
+
+		if (ring->int_flags & INT_FLAG_REG_RING)
+			flags |= IORING_ENTER_REGISTERED_RING;
+		____sys_io_uring_enter(ring->enter_ring_fd, 0, 0, flags, NULL);
 		overflow_checked = true;
 		goto again;
 	}
@@ -239,7 +220,8 @@ out:
  */
 static int io_uring_wait_cqes_new(struct io_uring *ring,
 				  struct io_uring_cqe **cqe_ptr,
-				  unsigned wait_nr, struct __kernel_timespec *ts,
+				  unsigned wait_nr,
+				  struct __kernel_timespec *ts,
 				  sigset_t *sigmask)
 {
 	struct io_uring_getevents_arg arg = {
@@ -248,7 +230,6 @@ static int io_uring_wait_cqes_new(struct io_uring *ring,
 		.ts		= (unsigned long) ts
 	};
 	struct get_data data = {
-		.submit		= __io_uring_flush_sq(ring),
 		.wait_nr	= wait_nr,
 		.get_flags	= IORING_ENTER_EXT_ARG,
 		.sz		= sizeof(arg),
@@ -275,36 +256,77 @@ static int io_uring_wait_cqes_new(struct io_uring *ring,
  * hence this function is safe to use for applications that split SQ and CQ
  * handling between two threads.
  */
+static int __io_uring_submit_timeout(struct io_uring *ring, unsigned wait_nr,
+				     struct __kernel_timespec *ts)
+{
+	struct io_uring_sqe *sqe;
+	int ret;
+
+	/*
+	 * If the SQ ring is full, we may need to submit IO first
+	 */
+	sqe = io_uring_get_sqe(ring);
+	if (!sqe) {
+		ret = io_uring_submit(ring);
+		if (ret < 0)
+			return ret;
+		sqe = io_uring_get_sqe(ring);
+		if (!sqe)
+			return -EAGAIN;
+	}
+	io_uring_prep_timeout(sqe, ts, wait_nr, 0);
+	sqe->user_data = LIBURING_UDATA_TIMEOUT;
+	return __io_uring_flush_sq(ring);
+}
+
 int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
 		       unsigned wait_nr, struct __kernel_timespec *ts,
 		       sigset_t *sigmask)
 {
-	unsigned to_submit = 0;
+	int to_submit = 0;
 
 	if (ts) {
-		struct io_uring_sqe *sqe;
-		int ret;
-
 		if (ring->features & IORING_FEAT_EXT_ARG)
 			return io_uring_wait_cqes_new(ring, cqe_ptr, wait_nr,
 						      ts, sigmask);
+		to_submit = __io_uring_submit_timeout(ring, wait_nr, ts);
+		if (to_submit < 0)
+			return to_submit;
+	}
 
-		/*
-		 * If the SQ ring is full, we may need to submit IO first
-		 */
-		sqe = io_uring_get_sqe(ring);
-		if (!sqe) {
-			ret = io_uring_submit(ring);
-			if (ret < 0)
-				return ret;
-			sqe = io_uring_get_sqe(ring);
-			if (!sqe)
-				return -EAGAIN;
+	return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
+}
+
+int io_uring_submit_and_wait_timeout(struct io_uring *ring,
+				     struct io_uring_cqe **cqe_ptr,
+				     unsigned wait_nr,
+				     struct __kernel_timespec *ts,
+				     sigset_t *sigmask)
+{
+	int to_submit;
+
+	if (ts) {
+		if (ring->features & IORING_FEAT_EXT_ARG) {
+			struct io_uring_getevents_arg arg = {
+				.sigmask	= (unsigned long) sigmask,
+				.sigmask_sz	= _NSIG / 8,
+				.ts		= (unsigned long) ts
+			};
+			struct get_data data = {
+				.submit		= __io_uring_flush_sq(ring),
+				.wait_nr	= wait_nr,
+				.get_flags	= IORING_ENTER_EXT_ARG,
+				.sz		= sizeof(arg),
+				.arg		= &arg
+			};
+
+			return _io_uring_get_cqe(ring, cqe_ptr, &data);
 		}
-		io_uring_prep_timeout(sqe, ts, wait_nr, 0);
-		sqe->user_data = LIBURING_UDATA_TIMEOUT;
+		to_submit = __io_uring_submit_timeout(ring, wait_nr, ts);
+		if (to_submit < 0)
+			return to_submit;
+	} else
 		to_submit = __io_uring_flush_sq(ring);
-	}
 
 	return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
 }
@@ -335,11 +357,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned submitted,
 	if (sq_ring_needs_enter(ring, &flags) || wait_nr) {
 		if (wait_nr || (ring->flags & IORING_SETUP_IOPOLL))
 			flags |= IORING_ENTER_GETEVENTS;
+		if (ring->int_flags & INT_FLAG_REG_RING)
+			flags |= IORING_ENTER_REGISTERED_RING;
 
-		ret = __sys_io_uring_enter(ring->ring_fd, submitted, wait_nr,
-					   flags, NULL);
-		if (ret < 0)
-			return -errno;
+		ret = ____sys_io_uring_enter(ring->enter_ring_fd, submitted,
+					     wait_nr, flags, NULL);
 	} else
 		ret = submitted;
 
@@ -371,34 +393,19 @@ int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
 	return __io_uring_submit_and_wait(ring, wait_nr);
 }
 
-/*
- * Return an sqe to fill. Application must later call io_uring_submit()
- * when it's ready to tell the kernel about it. The caller may call this
- * function multiple times before calling io_uring_submit().
- *
- * Returns a vacant sqe, or NULL if we're full.
- */
+#ifdef LIBURING_INTERNAL
 struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
 {
-	struct io_uring_sq *sq = &ring->sq;
-	unsigned int head = io_uring_smp_load_acquire(sq->khead);
-	unsigned int next = sq->sqe_tail + 1;
-	struct io_uring_sqe *sqe = NULL;
-
-	if (next - head <= *sq->kring_entries) {
-		sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask];
-		sq->sqe_tail = next;
-	}
-	return sqe;
+	return _io_uring_get_sqe(ring);
 }
+#endif
 
 int __io_uring_sqring_wait(struct io_uring *ring)
 {
-	int ret;
+	int flags = IORING_ENTER_SQ_WAIT;
 
-	ret = __sys_io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_SQ_WAIT,
-				   NULL);
-	if (ret < 0)
-		ret = -errno;
-	return ret;
+	if (ring->int_flags & INT_FLAG_REG_RING)
+		flags |= IORING_ENTER_REGISTERED_RING;
+
+	return ____sys_io_uring_enter(ring->enter_ring_fd, 0, 0, flags, NULL);
 }
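queue.c now exports io_uring_submit_and_wait_timeout(), which flushes the SQ ring and waits for completions in a single io_uring_enter() when IORING_FEAT_EXT_ARG is available, falling back to an internal timeout SQE otherwise. Typical use (a sketch):

	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1 };
	int ret;

	/* submit whatever has been prepped, then wait up to 1s for 8 CQEs */
	ret = io_uring_submit_and_wait_timeout(&ring, &cqe, 8, &ts, NULL);
	if (ret < 0)
		fprintf(stderr, "wait failed: %s\n", strerror(-ret));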
diff --git a/src/register.c b/src/register.c
index 994aaff..993c450 100644
--- a/src/register.c
+++ b/src/register.c
@@ -1,42 +1,91 @@
 /* SPDX-License-Identifier: MIT */
 #define _POSIX_C_SOURCE 200112L
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
 #include "liburing/compat.h"
 #include "liburing/io_uring.h"
-#include "liburing.h"
-#include "syscall.h"
 
+int io_uring_register_buffers_update_tag(struct io_uring *ring, unsigned off,
+					 const struct iovec *iovecs,
+					 const __u64 *tags,
+					 unsigned nr)
+{
+	struct io_uring_rsrc_update2 up = {
+		.offset	= off,
+		.data = (unsigned long)iovecs,
+		.tags = (unsigned long)tags,
+		.nr = nr,
+	};
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_BUFFERS_UPDATE, &up,
+					 sizeof(up));
+}
+
+int io_uring_register_buffers_tags(struct io_uring *ring,
+				   const struct iovec *iovecs,
+				   const __u64 *tags,
+				   unsigned nr)
+{
+	struct io_uring_rsrc_register reg = {
+		.nr = nr,
+		.data = (unsigned long)iovecs,
+		.tags = (unsigned long)tags,
+	};
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_BUFFERS2, &reg,
+					 sizeof(reg));
+}
+
+int io_uring_register_buffers_sparse(struct io_uring *ring, unsigned nr)
+{
+	struct io_uring_rsrc_register reg = {
+		.flags = IORING_RSRC_REGISTER_SPARSE,
+		.nr = nr,
+	};
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_BUFFERS2, &reg,
+					 sizeof(reg));
+}
 
 int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs,
 			      unsigned nr_iovecs)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_BUFFERS,
+	ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_BUFFERS,
 					iovecs, nr_iovecs);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_unregister_buffers(struct io_uring *ring)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_BUFFERS,
-				      NULL, 0);
-	if (ret < 0)
-		return -errno;
+	ret = ____sys_io_uring_register(ring->ring_fd,
+					IORING_UNREGISTER_BUFFERS, NULL, 0);
+	return (ret < 0) ? ret : 0;
+}
 
-	return 0;
+int io_uring_register_files_update_tag(struct io_uring *ring, unsigned off,
+				       const int *files, const __u64 *tags,
+				       unsigned nr_files)
+{
+	struct io_uring_rsrc_update2 up = {
+		.offset	= off,
+		.data = (unsigned long)files,
+		.tags = (unsigned long)tags,
+		.nr = nr_files,
+	};
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_FILES_UPDATE2, &up,
+					 sizeof(up));
 }
 
 /*
@@ -53,76 +102,138 @@ int io_uring_register_files_update(struct io_uring *ring, unsigned off,
 		.offset	= off,
 		.fds	= (unsigned long) files,
 	};
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_FILES_UPDATE, &up,
+					 nr_files);
+}
+
+static int increase_rlimit_nofile(unsigned nr)
+{
 	int ret;
+	struct rlimit rlim;
 
-	ret = __sys_io_uring_register(ring->ring_fd,
-				      IORING_REGISTER_FILES_UPDATE, &up,
-				      nr_files);
+	ret = __sys_getrlimit(RLIMIT_NOFILE, &rlim);
 	if (ret < 0)
-		return -errno;
+		return ret;
+
+	if (rlim.rlim_cur < nr) {
+		rlim.rlim_cur += nr;
+		__sys_setrlimit(RLIMIT_NOFILE, &rlim);
+	}
+
+	return 0;
+}
+
+int io_uring_register_files_sparse(struct io_uring *ring, unsigned nr)
+{
+	struct io_uring_rsrc_register reg = {
+		.flags = IORING_RSRC_REGISTER_SPARSE,
+		.nr = nr,
+	};
+	int ret, did_increase = 0;
+
+	do {
+		ret = ____sys_io_uring_register(ring->ring_fd,
+						IORING_REGISTER_FILES2, &reg,
+						sizeof(reg));
+		if (ret >= 0)
+			break;
+		if (ret == -EMFILE && !did_increase) {
+			did_increase = 1;
+			increase_rlimit_nofile(nr);
+			continue;
+		}
+		break;
+	} while (1);
 
 	return ret;
 }
 
-int io_uring_register_files(struct io_uring *ring, const int *files,
-			    unsigned nr_files)
+int io_uring_register_files_tags(struct io_uring *ring, const int *files,
+				 const __u64 *tags, unsigned nr)
 {
-	int ret;
+	struct io_uring_rsrc_register reg = {
+		.nr = nr,
+		.data = (unsigned long)files,
+		.tags = (unsigned long)tags,
+	};
+	int ret, did_increase = 0;
+
+	do {
+		ret = ____sys_io_uring_register(ring->ring_fd,
+						IORING_REGISTER_FILES2, &reg,
+						sizeof(reg));
+		if (ret >= 0)
+			break;
+		if (ret == -EMFILE && !did_increase) {
+			did_increase = 1;
+			increase_rlimit_nofile(nr);
+			continue;
+		}
+		break;
+	} while (1);
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_FILES,
-				      files, nr_files);
-	if (ret < 0)
-		return -errno;
+	return ret;
+}
 
-	return 0;
+int io_uring_register_files(struct io_uring *ring, const int *files,
+			    unsigned nr_files)
+{
+	int ret, did_increase = 0;
+
+	do {
+		ret = ____sys_io_uring_register(ring->ring_fd,
+						IORING_REGISTER_FILES, files,
+						nr_files);
+		if (ret >= 0)
+			break;
+		if (ret == -EMFILE && !did_increase) {
+			did_increase = 1;
+			increase_rlimit_nofile(nr_files);
+			continue;
+		}
+		break;
+	} while (1);
+
+	return ret;
 }
 
 int io_uring_unregister_files(struct io_uring *ring)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_FILES,
+	ret = ____sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_FILES,
 					NULL, 0);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_register_eventfd(struct io_uring *ring, int event_fd)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD,
+	ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD,
 					&event_fd, 1);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_unregister_eventfd(struct io_uring *ring)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_EVENTFD,
-				      NULL, 0);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	ret = ____sys_io_uring_register(ring->ring_fd,
+					IORING_UNREGISTER_EVENTFD, NULL, 0);
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_register_eventfd_async(struct io_uring *ring, int event_fd)
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD_ASYNC,
-				      &event_fd, 1);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	ret = ____sys_io_uring_register(ring->ring_fd,
+					IORING_REGISTER_EVENTFD_ASYNC,
+					&event_fd, 1);
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p,
@@ -130,36 +241,22 @@ int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p,
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PROBE,
-				      p, nr_ops);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PROBE, p,
+					nr_ops);
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_register_personality(struct io_uring *ring)
 {
-	int ret;
-
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PERSONALITY,
-				      NULL, 0);
-	if (ret < 0)
-		return -errno;
-
-	return ret;
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_PERSONALITY, NULL, 0);
 }
 
 int io_uring_unregister_personality(struct io_uring *ring, int id)
 {
-	int ret;
-
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_PERSONALITY,
-				      NULL, id);
-	if (ret < 0)
-		return -errno;
-
-	return ret;
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_UNREGISTER_PERSONALITY, NULL,
+					 id);
 }
 
 int io_uring_register_restrictions(struct io_uring *ring,
@@ -168,22 +265,83 @@ int io_uring_register_restrictions(struct io_uring *ring,
 {
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_RESTRICTIONS,
-				      res, nr_res);
-	if (ret < 0)
-		return -errno;
-
-	return 0;
+	ret = ____sys_io_uring_register(ring->ring_fd,
+					IORING_REGISTER_RESTRICTIONS, res,
+					nr_res);
+	return (ret < 0) ? ret : 0;
 }
 
 int io_uring_enable_rings(struct io_uring *ring)
 {
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_ENABLE_RINGS, NULL, 0);
+}
+
+int io_uring_register_iowq_aff(struct io_uring *ring, size_t cpusz,
+			       const cpu_set_t *mask)
+{
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_IOWQ_AFF, mask, cpusz);
+}
+
+int io_uring_unregister_iowq_aff(struct io_uring *ring)
+{
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_UNREGISTER_IOWQ_AFF, NULL, 0);
+}
+
+int io_uring_register_iowq_max_workers(struct io_uring *ring, unsigned int *val)
+{
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_IOWQ_MAX_WORKERS, val,
+					 2);
+}
+
+int io_uring_register_ring_fd(struct io_uring *ring)
+{
+	struct io_uring_rsrc_update up = {
+		.data = ring->ring_fd,
+		.offset = -1U,
+	};
 	int ret;
 
-	ret = __sys_io_uring_register(ring->ring_fd,
-				      IORING_REGISTER_ENABLE_RINGS, NULL, 0);
-	if (ret < 0)
-		return -errno;
+	ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_RING_FDS,
+					&up, 1);
+	if (ret == 1) {
+		ring->enter_ring_fd = up.offset;
+		ring->int_flags |= INT_FLAG_REG_RING;
+	}
+	return ret;
+}
+
+
+int io_uring_unregister_ring_fd(struct io_uring *ring)
+{
+	struct io_uring_rsrc_update up = {
+		.offset = ring->enter_ring_fd,
+	};
+	int ret;
 
+	ret = ____sys_io_uring_register(ring->ring_fd,
+					IORING_UNREGISTER_RING_FDS, &up, 1);
+	if (ret == 1) {
+		ring->enter_ring_fd = ring->ring_fd;
+		ring->int_flags &= ~INT_FLAG_REG_RING;
+	}
 	return ret;
 }
+
+int io_uring_register_buf_ring(struct io_uring *ring,
+			       struct io_uring_buf_reg *reg, unsigned int flags)
+{
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_REGISTER_PBUF_RING, reg, 1);
+}
+
+int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid)
+{
+	struct io_uring_buf_reg reg = { .bgid = bgid };
+
+	return ____sys_io_uring_register(ring->ring_fd,
+					 IORING_UNREGISTER_PBUF_RING, &reg, 1);
+}
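io_uring_register_ring_fd() registers the ring's own file descriptor with the kernel so that later io_uring_enter() calls can pass a registered index plus IORING_ENTER_REGISTERED_RING instead of an fd, skipping the per-call fdget/fdput overhead. The library tracks this through enter_ring_fd and INT_FLAG_REG_RING, so submission paths pick it up transparently (sketch):

	int ret = io_uring_register_ring_fd(&ring);
	if (ret != 1)
		/* older kernel or slots full; the plain fd path still works */
		fprintf(stderr, "ring fd not registered: %d\n", ret);
	/* io_uring_submit() and friends now use the registered index */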
diff --git a/src/setup.c b/src/setup.c
index 54225e8..d2adc7f 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -1,26 +1,18 @@
 /* SPDX-License-Identifier: MIT */
 #define _DEFAULT_SOURCE
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include <signal.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
 #include "liburing/compat.h"
 #include "liburing/io_uring.h"
-#include "liburing.h"
-
-#include "syscall.h"
 
 static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
 {
-	munmap(sq->ring_ptr, sq->ring_sz);
+	__sys_munmap(sq->ring_ptr, sq->ring_sz);
 	if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
-		munmap(cq->ring_ptr, cq->ring_sz);
+		__sys_munmap(cq->ring_ptr, cq->ring_sz);
 }
 
 static int io_uring_mmap(int fd, struct io_uring_params *p,
@@ -29,27 +21,33 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
 	size_t size;
 	int ret;
 
+	size = sizeof(struct io_uring_cqe);
+	if (p->flags & IORING_SETUP_CQE32)
+		size += sizeof(struct io_uring_cqe);
+
 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
-	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
+	cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;
 
 	if (p->features & IORING_FEAT_SINGLE_MMAP) {
 		if (cq->ring_sz > sq->ring_sz)
 			sq->ring_sz = cq->ring_sz;
 		cq->ring_sz = sq->ring_sz;
 	}
-	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
-			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
-	if (sq->ring_ptr == MAP_FAILED)
-		return -errno;
+	sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_POPULATE, fd,
+				  IORING_OFF_SQ_RING);
+	if (IS_ERR(sq->ring_ptr))
+		return PTR_ERR(sq->ring_ptr);
 
 	if (p->features & IORING_FEAT_SINGLE_MMAP) {
 		cq->ring_ptr = sq->ring_ptr;
 	} else {
-		cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
-				    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-		if (cq->ring_ptr == MAP_FAILED) {
+		cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+					  MAP_SHARED | MAP_POPULATE, fd,
+					  IORING_OFF_CQ_RING);
+		if (IS_ERR(cq->ring_ptr)) {
+			ret = PTR_ERR(cq->ring_ptr);
 			cq->ring_ptr = NULL;
-			ret = -errno;
 			goto err;
 		}
 	}
@@ -62,12 +60,13 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
 	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
 	sq->array = sq->ring_ptr + p->sq_off.array;
 
-	size = p->sq_entries * sizeof(struct io_uring_sqe);
-	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
-			MAP_SHARED | MAP_POPULATE, fd,
-			IORING_OFF_SQES);
-	if (sq->sqes == MAP_FAILED) {
-		ret = -errno;
+	size = sizeof(struct io_uring_sqe);
+	if (p->flags & IORING_SETUP_SQE128)
+		size += 64;
+	sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
+			      MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+	if (IS_ERR(sq->sqes)) {
+		ret = PTR_ERR(sq->sqes);
 err:
 		io_uring_unmap_rings(sq, cq);
 		return ret;
@@ -98,7 +97,8 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
 	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
 	if (!ret) {
 		ring->flags = p->flags;
-		ring->ring_fd = fd;
+		ring->ring_fd = ring->enter_ring_fd = fd;
+		ring->int_flags = 0;
 	}
 	return ret;
 }
@@ -115,21 +115,24 @@ int io_uring_ring_dontfork(struct io_uring *ring)
 	if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
 		return -EINVAL;
 
-	len = *ring->sq.kring_entries * sizeof(struct io_uring_sqe);
-	ret = madvise(ring->sq.sqes, len, MADV_DONTFORK);
-	if (ret == -1)
-		return -errno;
+	len = sizeof(struct io_uring_sqe);
+	if (ring->flags & IORING_SETUP_SQE128)
+		len += 64;
+	len *= *ring->sq.kring_entries;
+	ret = __sys_madvise(ring->sq.sqes, len, MADV_DONTFORK);
+	if (ret < 0)
+		return ret;
 
 	len = ring->sq.ring_sz;
-	ret = madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
-	if (ret == -1)
-		return -errno;
+	ret = __sys_madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
+	if (ret < 0)
+		return ret;
 
 	if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
 		len = ring->cq.ring_sz;
-		ret = madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
-		if (ret == -1)
-			return -errno;
+		ret = __sys_madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
+		if (ret < 0)
+			return ret;
 	}
 
 	return 0;
@@ -140,13 +143,13 @@ int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
 {
 	int fd, ret;
 
-	fd = __sys_io_uring_setup(entries, p);
+	fd = ____sys_io_uring_setup(entries, p);
 	if (fd < 0)
-		return -errno;
+		return fd;
 
 	ret = io_uring_queue_mmap(fd, p, ring);
 	if (ret) {
-		close(fd);
+		__sys_close(fd);
 		return ret;
 	}
 
@@ -172,10 +175,20 @@ void io_uring_queue_exit(struct io_uring *ring)
 {
 	struct io_uring_sq *sq = &ring->sq;
 	struct io_uring_cq *cq = &ring->cq;
+	size_t sqe_size;
 
-	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
+	sqe_size = sizeof(struct io_uring_sqe);
+	if (ring->flags & IORING_SETUP_SQE128)
+		sqe_size += 64;
+	__sys_munmap(sq->sqes, sqe_size * *sq->kring_entries);
 	io_uring_unmap_rings(sq, cq);
-	close(ring->ring_fd);
+	/*
+	 * Not strictly required, but frees up the slot we used now rather
+	 * than at process exit time.
+	 */
+	if (ring->int_flags & INT_FLAG_REG_RING)
+		io_uring_unregister_ring_fd(ring);
+	__sys_close(ring->ring_fd);
 }
 
 struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
@@ -185,7 +198,7 @@ struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
 	int r;
 
 	len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
-	probe = malloc(len);
+	probe = uring_malloc(len);
 	if (!probe)
 		return NULL;
 	memset(probe, 0, len);
@@ -194,7 +207,7 @@ struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
 	if (r >= 0)
 		return probe;
 
-	free(probe);
+	uring_free(probe);
 	return NULL;
 }
 
@@ -215,36 +228,14 @@ struct io_uring_probe *io_uring_get_probe(void)
 
 void io_uring_free_probe(struct io_uring_probe *probe)
 {
-	free(probe);
+	uring_free(probe);
 }
 
-static int __fls(int x)
+static inline int __fls(int x)
 {
-	int r = 32;
-
 	if (!x)
 		return 0;
-	if (!(x & 0xffff0000u)) {
-		x <<= 16;
-		r -= 16;
-	}
-	if (!(x & 0xff000000u)) {
-		x <<= 8;
-		r -= 8;
-	}
-	if (!(x & 0xf0000000u)) {
-		x <<= 4;
-		r -= 4;
-	}
-	if (!(x & 0xc0000000u)) {
-		x <<= 2;
-		r -= 2;
-	}
-	if (!(x & 0x80000000u)) {
-		x <<= 1;
-		r -= 1;
-	}
-	return r;
+	return 8 * sizeof(x) - __builtin_clz(x);
 }
 
 static unsigned roundup_pow2(unsigned depth)
@@ -261,16 +252,23 @@ static size_t npages(size_t size, unsigned page_size)
 
 #define KRING_SIZE	320
 
-static size_t rings_size(unsigned entries, unsigned cq_entries, unsigned page_size)
+static size_t rings_size(struct io_uring_params *p, unsigned entries,
+			 unsigned cq_entries, unsigned page_size)
 {
 	size_t pages, sq_size, cq_size;
 
-	cq_size = KRING_SIZE;
-	cq_size += cq_entries * sizeof(struct io_uring_cqe);
+	cq_size = sizeof(struct io_uring_cqe);
+	if (p->flags & IORING_SETUP_CQE32)
+		cq_size += sizeof(struct io_uring_cqe);
+	cq_size *= cq_entries;
+	cq_size += KRING_SIZE;
 	cq_size = (cq_size + 63) & ~63UL;
 	pages = (size_t) 1 << npages(cq_size, page_size);
 
-	sq_size = sizeof(struct io_uring_sqe) * entries;
+	sq_size = sizeof(struct io_uring_sqe);
+	if (p->flags & IORING_SETUP_SQE128)
+		sq_size += 64;
+	sq_size *= entries;
 	pages += (size_t) 1 << npages(sq_size, page_size);
 	return pages * page_size;
 }
@@ -337,11 +335,8 @@ ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
 		cq_entries = 2 * entries;
 	}
 
-	page_size = sysconf(_SC_PAGESIZE);
-	if (page_size < 0)
-		page_size = 4096;
-
-	return rings_size(entries, cq_entries, page_size);
+	page_size = get_page_size();
+	return rings_size(p, entries, cq_entries, page_size);
 }
 
 /*
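Since setup.c now sizes all three mappings for the big SQE/CQE formats, opting in is just a setup flag; IORING_SETUP_SQE128 and IORING_SETUP_CQE32 are meant for command-style opcodes such as IORING_OP_URING_CMD that need the extra SQE and CQE space (a sketch; older kernels reject the flags with -EINVAL):

	struct io_uring ring;
	struct io_uring_params p = {
		.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32,
	};

	int ret = io_uring_queue_init_params(8, &ring, &p);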
diff --git a/src/syscall.c b/src/syscall.c
index 2fd3dd4..362f1f5 100644
--- a/src/syscall.c
+++ b/src/syscall.c
@@ -2,6 +2,16 @@
 #define _DEFAULT_SOURCE
 
 /*
+ * Functions in this file require libc, only build them when we use libc.
+ *
+ * Note:
+ * liburing's tests still need these functions.
+ */
+#if defined(CONFIG_NOLIBC) && !defined(LIBURING_BUILD_TEST)
+#error "This file should only be compiled for libc build, or for liburing tests"
+#endif
+
+/*
  * Will go away once libc support is there
  */
 #include <unistd.h>
@@ -11,32 +21,6 @@
 #include "liburing/io_uring.h"
 #include "syscall.h"
 
-#ifdef __alpha__
-/*
- * alpha is the only exception, all other architectures
- * have common numbers for new system calls.
- */
-# ifndef __NR_io_uring_setup
-#  define __NR_io_uring_setup		535
-# endif
-# ifndef __NR_io_uring_enter
-#  define __NR_io_uring_enter		536
-# endif
-# ifndef __NR_io_uring_register
-#  define __NR_io_uring_register	537
-# endif
-#else /* !__alpha__ */
-# ifndef __NR_io_uring_setup
-#  define __NR_io_uring_setup		425
-# endif
-# ifndef __NR_io_uring_enter
-#  define __NR_io_uring_enter		426
-# endif
-# ifndef __NR_io_uring_register
-#  define __NR_io_uring_register	427
-# endif
-#endif
-
 int __sys_io_uring_register(int fd, unsigned opcode, const void *arg,
 			    unsigned nr_args)
 {
@@ -49,15 +33,15 @@ int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
 }
 
 int __sys_io_uring_enter2(int fd, unsigned to_submit, unsigned min_complete,
-			 unsigned flags, sigset_t *sig, int sz)
+			  unsigned flags, sigset_t *sig, int sz)
 {
-	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
-		       flags, sig, sz);
+	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags,
+		       sig, sz);
 }
 
 int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
 			 unsigned flags, sigset_t *sig)
 {
 	return __sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig,
-				    _NSIG / 8);
+				     _NSIG / 8);
 }
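Note the error-reporting split this refactor introduces: the __sys_* wrappers above keep the historical libc convention of returning -1 with errno set, while the ____sys_* arch wrappers used inside the library return the negative error code directly (which is why callers like register.c can simply propagate the return value). Callers of the raw libc wrappers still need the errno dance (sketch):

	int fd = __sys_io_uring_setup(entries, &p);
	if (fd < 0)
		fd = -errno;	/* libc wrapper: error lives in errno */

	/* native wrapper: error already encoded in the return value */
	int fd2 = ____sys_io_uring_setup(entries, &p);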
diff --git a/src/syscall.h b/src/syscall.h
index 3b94efc..214789d 100644
--- a/src/syscall.h
+++ b/src/syscall.h
@@ -2,19 +2,102 @@
 #ifndef LIBURING_SYSCALL_H
 #define LIBURING_SYSCALL_H
 
+#include <errno.h>
 #include <signal.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <liburing.h>
+
+#ifdef __alpha__
+/*
+ * alpha and mips are the exceptions; all other architectures have
+ * common numbers for new system calls.
+ */
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup		535
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter		536
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register		537
+#endif
+#elif defined __mips__
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup		(__NR_Linux + 425)
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter		(__NR_Linux + 426)
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register		(__NR_Linux + 427)
+#endif
+#else /* !__alpha__ and !__mips__ */
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup		425
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter		426
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register		427
+#endif
+#endif
+
+/*
+ * Don't put this below the #include "arch/$arch/syscall.h", that
+ * file may need it.
+ */
 struct io_uring_params;
 
+static inline void *ERR_PTR(intptr_t n)
+{
+	return (void *) n;
+}
+
+static inline intptr_t PTR_ERR(const void *ptr)
+{
+	return (intptr_t) ptr;
+}
+
+static inline bool IS_ERR(const void *ptr)
+{
+	return uring_unlikely((uintptr_t) ptr >= (uintptr_t) -4095UL);
+}
+
+#define __INTERNAL__LIBURING_SYSCALL_H
+#if defined(__x86_64__) || defined(__i386__)
+#include "arch/x86/syscall.h"
+#elif defined(__aarch64__)
+#include "arch/aarch64/syscall.h"
+#else
+/*
+ * We don't have native syscall wrappers
+ * for this arch. Must use libc!
+ */
+#ifdef CONFIG_NOLIBC
+	#error "This arch doesn't support building liburing without libc"
+#endif
+/* libc syscall wrappers. */
+#include "arch/generic/syscall.h"
+#endif
+#undef __INTERNAL__LIBURING_SYSCALL_H
+
 /*
- * System calls
+ * For backward compatibility.
+ * (these __sys* functions always use libc, see syscall.c)
 */
-extern int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p);
-extern int __sys_io_uring_enter(int fd, unsigned to_submit,
-	unsigned min_complete, unsigned flags, sigset_t *sig);
-extern int __sys_io_uring_enter2(int fd, unsigned to_submit,
-	unsigned min_complete, unsigned flags, sigset_t *sig, int sz);
-extern int __sys_io_uring_register(int fd, unsigned int opcode, const void *arg,
-	unsigned int nr_args);
+int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p);
+int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+			 unsigned flags, sigset_t *sig);
+int __sys_io_uring_enter2(int fd, unsigned to_submit, unsigned min_complete,
+			  unsigned flags, sigset_t *sig, int sz);
+int __sys_io_uring_register(int fd, unsigned int opcode, const void *arg,
+			    unsigned int nr_args);
 
 #endif
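Tying the new pieces together, a provided-buffer ring is just page-aligned memory holding io_uring_buf slots, registered once via IORING_REGISTER_PBUF_RING; the application then describes its buffers and publishes them by bumping the tail that overlays bufs[0].resv. A sketch, where bufs[] is an assumed application-owned array of 4 KiB data buffers:

	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg = { .ring_entries = 8, .bgid = 0 };
	int i;

	if (posix_memalign((void **)&br, 4096, 8 * sizeof(struct io_uring_buf)))
		return -ENOMEM;
	memset(br, 0, 8 * sizeof(struct io_uring_buf));
	reg.ring_addr = (unsigned long) br;
	if (io_uring_register_buf_ring(&ring, &reg, 0) < 0)
		return -1;

	for (i = 0; i < 8; i++) {	/* describe each buffer slot */
		br->bufs[i].addr = (unsigned long) bufs[i];
		br->bufs[i].len = 4096;
		br->bufs[i].bid = i;
	}
	io_uring_smp_store_release(&br->tail, 8);	/* publish to the kernel */

After a completion consumes a buffer, io_uring_buf_ring_cq_advance() (see the liburing.h hunk above) hands it back by advancing the same tail.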