Diffstat (limited to 'src/include/liburing.h')
-rw-r--r--  src/include/liburing.h  719
1 file changed, 590 insertions, 129 deletions
diff --git a/src/include/liburing.h b/src/include/liburing.h index 51dc602..1c1b03e 100644 --- a/src/include/liburing.h +++ b/src/include/liburing.h @@ -6,25 +6,31 @@ #define _XOPEN_SOURCE 500 /* Required for glibc to expose sigset_t */ #endif +#ifndef _GNU_SOURCE +#define _GNU_SOURCE /* Required for musl to expose cpu_set_t */ +#endif + #include <sys/socket.h> -#include <sys/uio.h> #include <sys/stat.h> +#include <sys/uio.h> #include <errno.h> #include <signal.h> #include <stdbool.h> #include <inttypes.h> #include <time.h> +#include <fcntl.h> +#include <sched.h> #include <linux/swab.h> #include "liburing/compat.h" #include "liburing/io_uring.h" #include "liburing/barrier.h" #ifndef uring_unlikely -# define uring_unlikely(cond) __builtin_expect(!!(cond), 0) +#define uring_unlikely(cond) __builtin_expect(!!(cond), 0) #endif #ifndef uring_likely -# define uring_likely(cond) __builtin_expect(!!(cond), 1) +#define uring_likely(cond) __builtin_expect(!!(cond), 1) #endif #ifdef __cplusplus @@ -75,7 +81,10 @@ struct io_uring { int ring_fd; unsigned features; - unsigned pad[3]; + int enter_ring_fd; + __u8 int_flags; + __u8 pad[3]; + unsigned pad2; }; /* @@ -86,74 +95,115 @@ struct io_uring { * return an allocated io_uring_probe structure, or NULL if probe fails (for * example, if it is not available). The caller is responsible for freeing it */ -extern struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring); +struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring); /* same as io_uring_get_probe_ring, but takes care of ring init and teardown */ -extern struct io_uring_probe *io_uring_get_probe(void); +struct io_uring_probe *io_uring_get_probe(void); /* * frees a probe allocated through io_uring_get_probe() or * io_uring_get_probe_ring() */ -extern void io_uring_free_probe(struct io_uring_probe *probe); +void io_uring_free_probe(struct io_uring_probe *probe); -static inline int io_uring_opcode_supported(const struct io_uring_probe *p, int op) +static inline int io_uring_opcode_supported(const struct io_uring_probe *p, + int op) { if (op > p->last_op) return 0; return (p->ops[op].flags & IO_URING_OP_SUPPORTED) != 0; } -extern int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, - struct io_uring_params *p); -extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, - unsigned flags); -extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, - struct io_uring *ring); -extern int io_uring_ring_dontfork(struct io_uring *ring); -extern void io_uring_queue_exit(struct io_uring *ring); +int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p); +int io_uring_queue_init(unsigned entries, struct io_uring *ring, + unsigned flags); +int io_uring_queue_mmap(int fd, struct io_uring_params *p, + struct io_uring *ring); +int io_uring_ring_dontfork(struct io_uring *ring); +void io_uring_queue_exit(struct io_uring *ring); unsigned io_uring_peek_batch_cqe(struct io_uring *ring, struct io_uring_cqe **cqes, unsigned count); -extern int io_uring_wait_cqes(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, unsigned wait_nr, - struct __kernel_timespec *ts, sigset_t *sigmask); -extern int io_uring_wait_cqe_timeout(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, struct __kernel_timespec *ts); -extern int io_uring_submit(struct io_uring *ring); -extern int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr); -extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); - -extern 
int io_uring_register_buffers(struct io_uring *ring, - const struct iovec *iovecs, - unsigned nr_iovecs); -extern int io_uring_unregister_buffers(struct io_uring *ring); -extern int io_uring_register_files(struct io_uring *ring, const int *files, - unsigned nr_files); -extern int io_uring_unregister_files(struct io_uring *ring); -extern int io_uring_register_files_update(struct io_uring *ring, unsigned off, - int *files, unsigned nr_files); -extern int io_uring_register_eventfd(struct io_uring *ring, int fd); -extern int io_uring_register_eventfd_async(struct io_uring *ring, int fd); -extern int io_uring_unregister_eventfd(struct io_uring *ring); -extern int io_uring_register_probe(struct io_uring *ring, - struct io_uring_probe *p, unsigned nr); -extern int io_uring_register_personality(struct io_uring *ring); -extern int io_uring_unregister_personality(struct io_uring *ring, int id); -extern int io_uring_register_restrictions(struct io_uring *ring, - struct io_uring_restriction *res, - unsigned int nr_res); -extern int io_uring_enable_rings(struct io_uring *ring); -extern int __io_uring_sqring_wait(struct io_uring *ring); +int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr, + unsigned wait_nr, struct __kernel_timespec *ts, + sigset_t *sigmask); +int io_uring_wait_cqe_timeout(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + struct __kernel_timespec *ts); +int io_uring_submit(struct io_uring *ring); +int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr); +int io_uring_submit_and_wait_timeout(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + unsigned wait_nr, + struct __kernel_timespec *ts, + sigset_t *sigmask); + +int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs, + unsigned nr_iovecs); +int io_uring_register_buffers_tags(struct io_uring *ring, + const struct iovec *iovecs, + const __u64 *tags, unsigned nr); +int io_uring_register_buffers_sparse(struct io_uring *ring, unsigned nr); +int io_uring_register_buffers_update_tag(struct io_uring *ring, + unsigned off, + const struct iovec *iovecs, + const __u64 *tags, unsigned nr); +int io_uring_unregister_buffers(struct io_uring *ring); + +int io_uring_register_files(struct io_uring *ring, const int *files, + unsigned nr_files); +int io_uring_register_files_tags(struct io_uring *ring, const int *files, + const __u64 *tags, unsigned nr); +int io_uring_register_files_sparse(struct io_uring *ring, unsigned nr); +int io_uring_register_files_update_tag(struct io_uring *ring, unsigned off, + const int *files, const __u64 *tags, + unsigned nr_files); + +int io_uring_unregister_files(struct io_uring *ring); +int io_uring_register_files_update(struct io_uring *ring, unsigned off, + int *files, unsigned nr_files); +int io_uring_register_eventfd(struct io_uring *ring, int fd); +int io_uring_register_eventfd_async(struct io_uring *ring, int fd); +int io_uring_unregister_eventfd(struct io_uring *ring); +int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p, + unsigned nr); +int io_uring_register_personality(struct io_uring *ring); +int io_uring_unregister_personality(struct io_uring *ring, int id); +int io_uring_register_restrictions(struct io_uring *ring, + struct io_uring_restriction *res, + unsigned int nr_res); +int io_uring_enable_rings(struct io_uring *ring); +int __io_uring_sqring_wait(struct io_uring *ring); +int io_uring_register_iowq_aff(struct io_uring *ring, size_t cpusz, + const cpu_set_t *mask); +int io_uring_unregister_iowq_aff(struct 
io_uring *ring); +int io_uring_register_iowq_max_workers(struct io_uring *ring, + unsigned int *values); +int io_uring_register_ring_fd(struct io_uring *ring); +int io_uring_unregister_ring_fd(struct io_uring *ring); +int io_uring_register_buf_ring(struct io_uring *ring, + struct io_uring_buf_reg *reg, unsigned int flags); +int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid); /* * Helper for the peek/wait single cqe functions. Exported because of that, * but probably shouldn't be used directly in an application. */ -extern int __io_uring_get_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, unsigned submit, - unsigned wait_nr, sigset_t *sigmask); +int __io_uring_get_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, unsigned submit, + unsigned wait_nr, sigset_t *sigmask); #define LIBURING_UDATA_TIMEOUT ((__u64) -1) +/* + * Calculates the step size for CQE iteration. + * For standard CQEs it's 1, for big CQEs it's 2. + */ +#define io_uring_cqe_shift(ring) \ + (!!((ring)->flags & IORING_SETUP_CQE32)) + +#define io_uring_cqe_index(ring,ptr,mask) \ + (((ptr) & (mask)) << io_uring_cqe_shift(ring)) + #define io_uring_for_each_cqe(ring, head, cqe) \ /* \ * io_uring_smp_load_acquire() enforces the order of tail \ @@ -161,7 +211,7 @@ extern int __io_uring_get_cqe(struct io_uring *ring, */ \ for (head = *(ring)->cq.khead; \ (cqe = (head != io_uring_smp_load_acquire((ring)->cq.ktail) ? \ - &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL)); \ + &(ring)->cq.cqes[io_uring_cqe_index(ring, head, *(ring)->cq.kring_mask)] : NULL)); \ head++) \ /* @@ -195,6 +245,11 @@ static inline void io_uring_cqe_seen(struct io_uring *ring, /* * Command prep helpers */ + +/* + * Associate pointer @data with the sqe, for later retrieval from the cqe + * at command completion time with io_uring_cqe_get_data(). + */ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) { sqe->user_data = (unsigned long) data; @@ -205,17 +260,45 @@ static inline void *io_uring_cqe_get_data(const struct io_uring_cqe *cqe) return (void *) (uintptr_t) cqe->user_data; }
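As a usage note, here is a minimal sketch of the user_data round trip these two helpers provide, assuming an already-initialized ring; the nop request and struct my_request are purely illustrative:

#include <errno.h>
#include <stdio.h>
#include <liburing.h>

struct my_request { int id; };

static int demo(struct io_uring *ring)
{
	struct my_request req = { .id = 42 };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_data(sqe, &req);	/* stash our pointer */

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;

	/* recover the pointer set at submission time */
	struct my_request *done = io_uring_cqe_get_data(cqe);
	printf("request %d completed, res %d\n", done->id, cqe->res);
	io_uring_cqe_seen(ring, cqe);		/* mark the CQE consumed */
	return 0;
}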
+/* + * Assign a 64-bit value to this sqe, which can get retrieved at completion + * time with io_uring_cqe_get_data64. Just like the non-64 variants, except + * these store a 64-bit type rather than a data pointer. + */ +static inline void io_uring_sqe_set_data64(struct io_uring_sqe *sqe, + __u64 data) +{ + sqe->user_data = data; +} + +static inline __u64 io_uring_cqe_get_data64(const struct io_uring_cqe *cqe) +{ + return cqe->user_data; +} + +/* + * Tell the app that we have the 64-bit variants of the get/set userdata + */ +#define LIBURING_HAVE_DATA64 + static inline void io_uring_sqe_set_flags(struct io_uring_sqe *sqe, unsigned flags) { - sqe->flags = flags; + sqe->flags = (__u8) flags; +} + +static inline void __io_uring_set_target_fixed_file(struct io_uring_sqe *sqe, + unsigned int file_index) +{ + /* 0 means no fixed files, indexes should be encoded as "index + 1" */ + sqe->file_index = file_index + 1; } static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, __u64 offset) { - sqe->opcode = op; + sqe->opcode = (__u8) op; sqe->flags = 0; sqe->ioprio = 0; sqe->fd = fd; @@ -223,27 +306,33 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, sqe->addr = (unsigned long) addr; sqe->len = len; sqe->rw_flags = 0; - sqe->user_data = 0; - sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0; + sqe->buf_index = 0; + sqe->personality = 0; + sqe->file_index = 0; + sqe->addr3 = 0; + sqe->__pad2[0] = 0; } /** * @pre Either fd_in or fd_out must be a pipe. * @param off_in If fd_in refers to a pipe, off_in must be (int64_t) -1; - * If fd_in does not refer to a pipe and off_in is (int64_t) -1, then bytes are read - * from fd_in starting from the file offset and it is adjust appropriately; - * If fd_in does not refer to a pipe and off_in is not (int64_t) -1, then the - * starting offset of fd_in will be off_in. + * If fd_in does not refer to a pipe and off_in is (int64_t) -1, + * then bytes are read from fd_in starting from the file offset + * and it is adjusted appropriately; + * If fd_in does not refer to a pipe and off_in is not + * (int64_t) -1, then the starting offset of fd_in will be + * off_in. * @param off_out The description of off_in also applies to off_out. * @param splice_flags see man splice(2) for a description of the flags. * - * This splice operation can be used to implement sendfile by splicing to an intermediate pipe - * first, then splice to the final destination. + * This splice operation can be used to implement sendfile by splicing to an + * intermediate pipe first, then splicing to the final destination. * In fact, the implementation of sendfile in the kernel uses splice internally. * - * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still failed with - * EINVAL if one of the fd doesn't explicitly support splice operation, e.g. reading from terminal - * is unsupported from kernel 5.7 to 5.11. + * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation + * can still fail with EINVAL if one of the fds doesn't explicitly support + * splice, e.g. reading from a terminal is unsupported from kernel 5.7 + * to 5.11. * Check issue #291 for more information.
*/ static inline void io_uring_prep_splice(struct io_uring_sqe *sqe, @@ -252,8 +341,9 @@ static inline void io_uring_prep_splice(struct io_uring_sqe *sqe, unsigned int nbytes, unsigned int splice_flags) { - io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, off_out); - sqe->splice_off_in = off_in; + io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, + (__u64) off_out); + sqe->splice_off_in = (__u64) off_in; sqe->splice_fd_in = fd_in; sqe->splice_flags = splice_flags; } @@ -271,32 +361,50 @@ static inline void io_uring_prep_tee(struct io_uring_sqe *sqe, static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) + unsigned nr_vecs, __u64 offset) { io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); } +static inline void io_uring_prep_readv2(struct io_uring_sqe *sqe, int fd, + const struct iovec *iovecs, + unsigned nr_vecs, __u64 offset, + int flags) +{ + io_uring_prep_readv(sqe, fd, iovecs, nr_vecs, offset); + sqe->rw_flags = flags; +} + static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, - off_t offset, int buf_index) + __u64 offset, int buf_index) { io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); - sqe->buf_index = buf_index; + sqe->buf_index = (__u16) buf_index; } static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) + unsigned nr_vecs, __u64 offset) { io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); } +static inline void io_uring_prep_writev2(struct io_uring_sqe *sqe, int fd, + const struct iovec *iovecs, + unsigned nr_vecs, __u64 offset, + int flags) +{ + io_uring_prep_writev(sqe, fd, iovecs, nr_vecs, offset); + sqe->rw_flags = flags; +} + static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, const void *buf, unsigned nbytes, - off_t offset, int buf_index) + __u64 offset, int buf_index) { io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); - sqe->buf_index = buf_index; + sqe->buf_index = (__u16) buf_index; } static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd, @@ -307,39 +415,51 @@ static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_sendmsg(struct io_uring_sqe *sqe, int fd, - const struct msghdr *msg, unsigned flags) + const struct msghdr *msg, + unsigned flags) { io_uring_prep_rw(IORING_OP_SENDMSG, sqe, fd, msg, 1, 0); sqe->msg_flags = flags; } -static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - unsigned poll_mask) +static inline unsigned __io_uring_prep_poll_mask(unsigned poll_mask) { - io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0); #if __BYTE_ORDER == __BIG_ENDIAN poll_mask = __swahw32(poll_mask); #endif - sqe->poll32_events = poll_mask; + return poll_mask; +} + +static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, + unsigned poll_mask) +{ + io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0); + sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask); +} + +static inline void io_uring_prep_poll_multishot(struct io_uring_sqe *sqe, + int fd, unsigned poll_mask) +{ + io_uring_prep_poll_add(sqe, fd, poll_mask); + sqe->len = IORING_POLL_ADD_MULTI; } static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, - void *user_data) + __u64 user_data) { - io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, 
user_data, 0, 0); + io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; } static inline void io_uring_prep_poll_update(struct io_uring_sqe *sqe, - void *old_user_data, - void *new_user_data, + __u64 old_user_data, + __u64 new_user_data, unsigned poll_mask, unsigned flags) { - io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, old_user_data, flags, - (__u64)new_user_data); -#if __BYTE_ORDER == __BIG_ENDIAN - poll_mask = __swahw32(poll_mask); -#endif - sqe->poll32_events = poll_mask; + io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, flags, + new_user_data); + sqe->addr = old_user_data; + sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask); } static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, @@ -365,8 +485,8 @@ static inline void io_uring_prep_timeout(struct io_uring_sqe *sqe, static inline void io_uring_prep_timeout_remove(struct io_uring_sqe *sqe, __u64 user_data, unsigned flags) { - io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, - (void *)(unsigned long)user_data, 0, 0); + io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; sqe->timeout_flags = flags; } @@ -374,9 +494,9 @@ static inline void io_uring_prep_timeout_update(struct io_uring_sqe *sqe, struct __kernel_timespec *ts, __u64 user_data, unsigned flags) { - io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, - (void *)(unsigned long)user_data, 0, - (uintptr_t)ts); + io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0, + (uintptr_t) ts); + sqe->addr = user_data; sqe->timeout_flags = flags | IORING_TIMEOUT_UPDATE; } @@ -386,14 +506,57 @@ static inline void io_uring_prep_accept(struct io_uring_sqe *sqe, int fd, { io_uring_prep_rw(IORING_OP_ACCEPT, sqe, fd, addr, 0, (__u64) (unsigned long) addrlen); - sqe->accept_flags = flags; + sqe->accept_flags = (__u32) flags; +} + +/* accept directly into the fixed file table */ +static inline void io_uring_prep_accept_direct(struct io_uring_sqe *sqe, int fd, + struct sockaddr *addr, + socklen_t *addrlen, int flags, + unsigned int file_index) +{ + io_uring_prep_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, file_index); +} + +static inline void io_uring_prep_multishot_accept(struct io_uring_sqe *sqe, + int fd, struct sockaddr *addr, + socklen_t *addrlen, int flags) +{ + io_uring_prep_accept(sqe, fd, addr, addrlen, flags); + sqe->ioprio |= IORING_ACCEPT_MULTISHOT; +} + +/* multishot accept directly into the fixed file table */ +static inline void io_uring_prep_multishot_accept_direct(struct io_uring_sqe *sqe, + int fd, + struct sockaddr *addr, + socklen_t *addrlen, + int flags) +{ + io_uring_prep_multishot_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1); +} + +static inline void io_uring_prep_cancel64(struct io_uring_sqe *sqe, + __u64 user_data, int flags) +{ + io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, NULL, 0, 0); + sqe->addr = user_data; + sqe->cancel_flags = (__u32) flags; +} + +static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe, + void *user_data, int flags) +{ + io_uring_prep_cancel64(sqe, (__u64) (uintptr_t) user_data, flags); } -static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_data, - int flags) +static inline void io_uring_prep_cancel_fd(struct io_uring_sqe *sqe, int fd, + unsigned int flags) { - io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, user_data, 0, 0); - sqe->cancel_flags = flags; + io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, 
sqe, fd, NULL, 0, 0); + sqe->cancel_flags = (__u32) flags | IORING_ASYNC_CANCEL_FD; } static inline void io_uring_prep_link_timeout(struct io_uring_sqe *sqe, @@ -415,7 +578,8 @@ static inline void io_uring_prep_files_update(struct io_uring_sqe *sqe, int *fds, unsigned nr_fds, int offset) { - io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, offset); + io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, + (__u64) offset); } static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd, @@ -423,14 +587,26 @@ static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd, { io_uring_prep_rw(IORING_OP_FALLOCATE, sqe, fd, - (const uintptr_t *) (unsigned long) len, mode, offset); + (const uintptr_t *) (unsigned long) len, + (unsigned int) mode, (__u64) offset); } static inline void io_uring_prep_openat(struct io_uring_sqe *sqe, int dfd, - const char *path, int flags, mode_t mode) + const char *path, int flags, + mode_t mode) { io_uring_prep_rw(IORING_OP_OPENAT, sqe, dfd, path, mode, 0); - sqe->open_flags = flags; + sqe->open_flags = (__u32) flags; +} + +/* open directly into the fixed file table */ +static inline void io_uring_prep_openat_direct(struct io_uring_sqe *sqe, + int dfd, const char *path, + int flags, mode_t mode, + unsigned file_index) +{ + io_uring_prep_openat(sqe, dfd, path, flags, mode); + __io_uring_set_target_fixed_file(sqe, file_index); } static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd) @@ -438,14 +614,22 @@ static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd) io_uring_prep_rw(IORING_OP_CLOSE, sqe, fd, NULL, 0, 0); } +static inline void io_uring_prep_close_direct(struct io_uring_sqe *sqe, + unsigned file_index) +{ + io_uring_prep_close(sqe, 0); + __io_uring_set_target_fixed_file(sqe, file_index); +} + static inline void io_uring_prep_read(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, off_t offset) + void *buf, unsigned nbytes, __u64 offset) { io_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset); } static inline void io_uring_prep_write(struct io_uring_sqe *sqe, int fd, - const void *buf, unsigned nbytes, off_t offset) + const void *buf, unsigned nbytes, + __u64 offset) { io_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset); } @@ -457,35 +641,35 @@ static inline void io_uring_prep_statx(struct io_uring_sqe *sqe, int dfd, { io_uring_prep_rw(IORING_OP_STATX, sqe, dfd, path, mask, (__u64) (unsigned long) statxbuf); - sqe->statx_flags = flags; + sqe->statx_flags = (__u32) flags; } static inline void io_uring_prep_fadvise(struct io_uring_sqe *sqe, int fd, - off_t offset, off_t len, int advice) + __u64 offset, off_t len, int advice) { - io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, len, offset); - sqe->fadvise_advice = advice; + io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, (__u32) len, offset); + sqe->fadvise_advice = (__u32) advice; } static inline void io_uring_prep_madvise(struct io_uring_sqe *sqe, void *addr, off_t length, int advice) { - io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, length, 0); - sqe->fadvise_advice = advice; + io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, (__u32) length, 0); + sqe->fadvise_advice = (__u32) advice; } static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, const void *buf, size_t len, int flags) { - io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, len, 0); - sqe->msg_flags = flags; + io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, (__u32) len, 0); + sqe->msg_flags = 
(__u32) flags; } static inline void io_uring_prep_recv(struct io_uring_sqe *sqe, int sockfd, void *buf, size_t len, int flags) { - io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, len, 0); - sqe->msg_flags = flags; + io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, (__u32) len, 0); + sqe->msg_flags = (__u32) flags; } static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd, @@ -495,57 +679,82 @@ static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd, (uint64_t) (uintptr_t) how); } +/* open directly into the fixed file table */ +static inline void io_uring_prep_openat2_direct(struct io_uring_sqe *sqe, + int dfd, const char *path, + struct open_how *how, + unsigned file_index) +{ + io_uring_prep_openat2(sqe, dfd, path, how); + __io_uring_set_target_fixed_file(sqe, file_index); +} + struct epoll_event; static inline void io_uring_prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd, int fd, int op, struct epoll_event *ev) { - io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev, op, fd); + io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev, + (__u32) op, (__u32) fd); } static inline void io_uring_prep_provide_buffers(struct io_uring_sqe *sqe, void *addr, int len, int nr, int bgid, int bid) { - io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, len, bid); - sqe->buf_group = bgid; + io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, (__u32) len, + (__u64) bid); + sqe->buf_group = (__u16) bgid; } static inline void io_uring_prep_remove_buffers(struct io_uring_sqe *sqe, int nr, int bgid) { io_uring_prep_rw(IORING_OP_REMOVE_BUFFERS, sqe, nr, NULL, 0, 0); - sqe->buf_group = bgid; + sqe->buf_group = (__u16) bgid; } static inline void io_uring_prep_shutdown(struct io_uring_sqe *sqe, int fd, int how) { - io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, how, 0); + io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, (__u32) how, 0); } static inline void io_uring_prep_unlinkat(struct io_uring_sqe *sqe, int dfd, const char *path, int flags) { io_uring_prep_rw(IORING_OP_UNLINKAT, sqe, dfd, path, 0, 0); - sqe->unlink_flags = flags; + sqe->unlink_flags = (__u32) flags; +} + +static inline void io_uring_prep_unlink(struct io_uring_sqe *sqe, + const char *path, int flags) +{ + io_uring_prep_unlinkat(sqe, AT_FDCWD, path, flags); } static inline void io_uring_prep_renameat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath, int newdfd, const char *newpath, int flags) { - io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath, newdfd, + io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath, + (__u32) newdfd, (uint64_t) (uintptr_t) newpath); - sqe->rename_flags = flags; + sqe->rename_flags = (__u32) flags; +} + +static inline void io_uring_prep_rename(struct io_uring_sqe *sqe, + const char *oldpath, const char *newpath) +{ + io_uring_prep_renameat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); } static inline void io_uring_prep_sync_file_range(struct io_uring_sqe *sqe, int fd, unsigned len, - off_t offset, int flags) + __u64 offset, int flags) { io_uring_prep_rw(IORING_OP_SYNC_FILE_RANGE, sqe, fd, NULL, len, offset); - sqe->sync_range_flags = flags; + sqe->sync_range_flags = (__u32) flags; } static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd, @@ -554,20 +763,123 @@ static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd, io_uring_prep_rw(IORING_OP_MKDIRAT, sqe, dfd, path, mode, 0); } +static inline void io_uring_prep_mkdir(struct io_uring_sqe *sqe, + const char *path, mode_t mode) +{ + 
io_uring_prep_mkdirat(sqe, AT_FDCWD, path, mode); +} + static inline void io_uring_prep_symlinkat(struct io_uring_sqe *sqe, - const char *target, int newdirfd, const char *linkpath) + const char *target, int newdirfd, + const char *linkpath) { io_uring_prep_rw(IORING_OP_SYMLINKAT, sqe, newdirfd, target, 0, (uint64_t) (uintptr_t) linkpath); } +static inline void io_uring_prep_symlink(struct io_uring_sqe *sqe, + const char *target, const char *linkpath) +{ + io_uring_prep_symlinkat(sqe, target, AT_FDCWD, linkpath); +} + static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath, int newdfd, const char *newpath, int flags) { - io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, newdfd, + io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, (__u32) newdfd, (uint64_t) (uintptr_t) newpath); - sqe->hardlink_flags = flags; + sqe->hardlink_flags = (__u32) flags; +} + +static inline void io_uring_prep_link(struct io_uring_sqe *sqe, + const char *oldpath, const char *newpath, int flags) +{ + io_uring_prep_linkat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, flags); +} + +static inline void io_uring_prep_msg_ring(struct io_uring_sqe *sqe, int fd, + unsigned int len, __u64 data, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_MSG_RING, sqe, fd, NULL, len, data); + sqe->rw_flags = flags; +} + +static inline void io_uring_prep_getxattr(struct io_uring_sqe *sqe, + const char *name, + const char *value, + const char *path, + size_t len) +{ + io_uring_prep_rw(IORING_OP_GETXATTR, sqe, 0, name, len, + (__u64) (uintptr_t) value); + sqe->addr3 = (__u64) (uintptr_t) path; + sqe->xattr_flags = 0; +} + +static inline void io_uring_prep_setxattr(struct io_uring_sqe *sqe, + const char *name, + const char *value, + const char *path, + int flags, + size_t len) +{ + io_uring_prep_rw(IORING_OP_SETXATTR, sqe, 0, name, len, + (__u64) (uintptr_t) value); + sqe->addr3 = (__u64) (uintptr_t) path; + sqe->xattr_flags = flags; +} + +static inline void io_uring_prep_fgetxattr(struct io_uring_sqe *sqe, + int fd, + const char *name, + const char *value, + size_t len) +{ + io_uring_prep_rw(IORING_OP_FGETXATTR, sqe, fd, name, len, + (__u64) (uintptr_t) value); + sqe->xattr_flags = 0; +} + +static inline void io_uring_prep_fsetxattr(struct io_uring_sqe *sqe, + int fd, + const char *name, + const char *value, + int flags, + size_t len) +{ + io_uring_prep_rw(IORING_OP_FSETXATTR, sqe, fd, name, len, + (__u64) (uintptr_t) value); + sqe->xattr_flags = flags; +} + +static inline void io_uring_prep_socket(struct io_uring_sqe *sqe, int domain, + int type, int protocol, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; +} + +static inline void io_uring_prep_socket_direct(struct io_uring_sqe *sqe, + int domain, int type, + int protocol, + unsigned file_index, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; + __io_uring_set_target_fixed_file(sqe, file_index); +} + +static inline void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe, + int domain, int type, int protocol, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type); + sqe->rw_flags = flags; + __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1); } /* @@ -576,15 +888,18 @@ static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd, */ static inline unsigned io_uring_sq_ready(const struct io_uring *ring) { + 
unsigned khead = *ring->sq.khead; + /* - * Without a barrier, we could miss an update and think the SQ wasn't ready. - * We don't need the load acquire for non-SQPOLL since then we drive updates. + * Without a barrier, we could miss an update and think the SQ wasn't + * ready. We don't need the load acquire for non-SQPOLL since then we + * drive updates. */ if (ring->flags & IORING_SETUP_SQPOLL) - return ring->sq.sqe_tail - io_uring_smp_load_acquire(ring->sq.khead); + khead = io_uring_smp_load_acquire(ring->sq.khead); /* always use real head, to avoid losing sync for short submit */ - return ring->sq.sqe_tail - *ring->sq.khead; + return ring->sq.sqe_tail - khead; } /* @@ -671,12 +986,62 @@ static inline int io_uring_wait_cqe_nr(struct io_uring *ring, } /* + * Internal helper, don't use directly in applications. Use one of the + * "official" versions of this, io_uring_peek_cqe(), io_uring_wait_cqe(), + * or io_uring_wait_cqes*(). + */ +static inline int __io_uring_peek_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, + unsigned *nr_available) +{ + struct io_uring_cqe *cqe; + int err = 0; + unsigned available; + unsigned mask = *ring->cq.kring_mask; + int shift = 0; + + if (ring->flags & IORING_SETUP_CQE32) + shift = 1; + + do { + unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail); + unsigned head = *ring->cq.khead; + + cqe = NULL; + available = tail - head; + if (!available) + break; + + cqe = &ring->cq.cqes[(head & mask) << shift]; + if (!(ring->features & IORING_FEAT_EXT_ARG) && + cqe->user_data == LIBURING_UDATA_TIMEOUT) { + if (cqe->res < 0) + err = cqe->res; + io_uring_cq_advance(ring, 1); + if (!err) + continue; + cqe = NULL; + } + + break; + } while (1); + + *cqe_ptr = cqe; + if (nr_available) + *nr_available = available; + return err; +} + +/* * Return an IO completion, if one is readily available. Returns 0 with * cqe_ptr filled in on success, -errno on failure. */ static inline int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { + if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr) + return 0; + return io_uring_wait_cqe_nr(ring, cqe_ptr, 0); } @@ -687,9 +1052,105 @@ static inline int io_uring_peek_cqe(struct io_uring *ring, static inline int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { + if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr) + return 0; + return io_uring_wait_cqe_nr(ring, cqe_ptr, 1); }
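For reference, a common completion-loop pattern built on these helpers: block for one CQE, then drain everything else already posted without further syscalls. A sketch; handle_completion() is a hypothetical application callback:

static int reap_completions(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	unsigned head, seen = 0;
	int ret;

	ret = io_uring_wait_cqe(ring, &cqe);	/* sleeps until one arrives */
	if (ret < 0)
		return ret;

	/* iterate whatever is ready right now */
	io_uring_for_each_cqe(ring, head, cqe) {
		handle_completion(io_uring_cqe_get_data(cqe), cqe->res);
		seen++;
	}
	io_uring_cq_advance(ring, seen);	/* consume them all at once */
	return 0;
}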
+/* + * Return an sqe to fill. Application must later call io_uring_submit() + * when it's ready to tell the kernel about it. The caller may call this + * function multiple times before calling io_uring_submit(). + * + * Returns a vacant sqe, or NULL if we're full. + */ +static inline struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring) +{ + struct io_uring_sq *sq = &ring->sq; + unsigned int head = io_uring_smp_load_acquire(sq->khead); + unsigned int next = sq->sqe_tail + 1; + int shift = 0; + + if (ring->flags & IORING_SETUP_SQE128) + shift = 1; + + if (next - head <= *sq->kring_entries) { + struct io_uring_sqe *sqe; + + sqe = &sq->sqes[(sq->sqe_tail & *sq->kring_mask) << shift]; + sq->sqe_tail = next; + return sqe; + } + + return NULL; +} + +/* + * Return the appropriate mask for a buffer ring of size 'ring_entries' + */ +static inline int io_uring_buf_ring_mask(__u32 ring_entries) +{ + return ring_entries - 1; +} + +static inline void io_uring_buf_ring_init(struct io_uring_buf_ring *br) +{ + br->tail = 0; +} + +/* + * Assign 'buf' with the addr/len/buffer ID supplied + */ +static inline void io_uring_buf_ring_add(struct io_uring_buf_ring *br, + void *addr, unsigned int len, + unsigned short bid, int mask, + int buf_offset) +{ + struct io_uring_buf *buf = &br->bufs[(br->tail + buf_offset) & mask]; + + buf->addr = (unsigned long) (uintptr_t) addr; + buf->len = len; + buf->bid = bid; +} + +/* + * Make 'count' new buffers visible to the kernel. Called after + * io_uring_buf_ring_add() has been called 'count' times to fill in new + * buffers. + */ +static inline void io_uring_buf_ring_advance(struct io_uring_buf_ring *br, + int count) +{ + unsigned short new_tail = br->tail + count; + + io_uring_smp_store_release(&br->tail, new_tail); +} + +/* + * Make 'count' new buffers visible to the kernel while at the same time + * advancing the CQ ring seen entries. This can be used when the application + * is using ring provided buffers and returns buffers while processing CQEs, + * avoiding an extra atomic when needing to increment both the CQ ring and + * the ring buffer index at the same time. + */ +static inline void io_uring_buf_ring_cq_advance(struct io_uring *ring, + struct io_uring_buf_ring *br, + int count) +{ + br->tail += count; + io_uring_cq_advance(ring, count); +} + +#ifndef LIBURING_INTERNAL +static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) +{ + return _io_uring_get_sqe(ring); +} +#else +struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); +#endif + ssize_t io_uring_mlock_size(unsigned entries, unsigned flags); ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p);
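To tie the buffer-ring helpers above together, here is one plausible setup sequence for a ring-mapped provided-buffer group. A sketch under assumptions: group id 0, nbufs is a power of two, bufs[] already allocated, error handling abbreviated:

#include <stdlib.h>
#include <liburing.h>

static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring,
						void **bufs, unsigned nbufs,
						unsigned buf_size)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg = { };
	unsigned i;

	/* ring memory must be page aligned; tail overlays bufs[0] padding */
	if (posix_memalign((void **) &br, 4096,
			   nbufs * sizeof(struct io_uring_buf)))
		return NULL;

	reg.ring_addr = (unsigned long) br;
	reg.ring_entries = nbufs;
	reg.bgid = 0;
	if (io_uring_register_buf_ring(ring, &reg, 0) < 0)
		return NULL;

	io_uring_buf_ring_init(br);
	for (i = 0; i < nbufs; i++)
		io_uring_buf_ring_add(br, bufs[i], buf_size,
				      (unsigned short) i,
				      io_uring_buf_ring_mask(nbufs), (int) i);
	io_uring_buf_ring_advance(br, (int) nbufs);
	return br;
}

A receive prepared with sqe->flags |= IOSQE_BUFFER_SELECT and sqe->buf_group = 0 then picks one of these buffers; the chosen buffer ID is reported in cqe->flags >> IORING_CQE_BUFFER_SHIFT.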
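Similarly, the multishot accept variant added in this patch is armed once and then produces one CQE per incoming connection. A sketch; listen_fd is assumed to be a listening socket:

#include <liburing.h>

static int arm_multishot_accept(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_sqe_set_data64(sqe, 0);	/* illustrative tag */
	return io_uring_submit(ring);
}

Each resulting CQE carries the accepted fd in cqe->res; if IORING_CQE_F_MORE is clear in cqe->flags, the request has terminated and must be re-armed.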
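The new io_uring_register_ring_fd() pairs with the enter_ring_fd/int_flags fields added to struct io_uring at the top of this patch: it registers the ring's own fd so subsequent io_uring_enter() calls can skip the per-call fd lookup. One plausible way to use it, treating the registration as a best-effort optimization:

#include <liburing.h>

static int init_ring(struct io_uring *ring)
{
	int ret = io_uring_queue_init(64, ring, 0);

	if (ret < 0)
		return ret;
	/* best effort: a negative return (e.g. -EINVAL on older kernels)
	 * can be ignored, the ring keeps working unregistered */
	io_uring_register_ring_fd(ring);
	return 0;
}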
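Finally, the IORING_OP_MSG_RING prep helper added here posts a CQE into another ring, giving a cheap cross-thread message/wakeup primitive. A sketch; target is assumed to be another thread's initialized ring:

#include <liburing.h>

static int poke_other_ring(struct io_uring *me, struct io_uring *target,
			   __u64 msg)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(me);

	if (!sqe)
		return -1;
	/* len and data arrive as res and user_data of the target's CQE */
	io_uring_prep_msg_ring(sqe, target->ring_fd, 0, msg, 0);
	return io_uring_submit(me);
}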