author     Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 04:55:56 +0000
committer  Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 04:55:56 +0000
commit     7ad54f005759f2e24bcb7db178e629b3dc507710 (patch)
tree       ebe9c872e416346e4b333e5062da32401c8e0a73
parent     d53db6851ea17b2d219d084e1afc683b8b62b105 (diff)
parent     1d27ff1934c5c4292dc00fba7f7f8ae411ed42f5 (diff)
download   liburing-android14-mainline-extservices-release.tar.gz
Snap for 10453563 from 1d27ff1934c5c4292dc00fba7f7f8ae411ed42f5 to mainline-extservices-release (refs: aml_ext_341620040, aml_ext_341518010, aml_ext_341414010, aml_ext_341317010, aml_ext_341131030, aml_ext_341027030, android14-mainline-extservices-release)
Change-Id: I37e718fbf69ba4f7fa036e2013d4ab5e5a2c1d57
-rw-r--r--  .github/pull_request_template.md  86
-rw-r--r--  .github/workflows/build.yml  127
-rw-r--r--  .github/workflows/shellcheck.yml  20
-rw-r--r--  .gitignore  111
-rw-r--r--  .travis.yml  22
-rw-r--r--  Android.bp  8
-rw-r--r--  CHANGELOG  29
-rw-r--r--  CITATION.cff  11
-rw-r--r--  LICENSE  19
-rw-r--r--  METADATA  4
-rw-r--r--  Makefile  10
-rw-r--r--  Makefile.common  6
-rw-r--r--  README  16
-rw-r--r--  SECURITY.md  6
-rwxr-xr-x  configure  127
-rw-r--r--  examples/Makefile  31
-rw-r--r--  examples/io_uring-cp.c  7
-rw-r--r--  examples/link-cp.c  6
-rw-r--r--  examples/ucontext-cp.c  2
-rw-r--r--  liburing.spec  2
-rw-r--r--  man/io_uring.7  59
-rw-r--r--  man/io_uring_buf_ring_add.3  53
-rw-r--r--  man/io_uring_buf_ring_advance.3  31
-rw-r--r--  man/io_uring_buf_ring_cq_advance.3  41
-rw-r--r--  man/io_uring_buf_ring_init.3  30
-rw-r--r--  man/io_uring_buf_ring_mask.3  27
-rw-r--r--  man/io_uring_cq_advance.3  49
-rw-r--r--  man/io_uring_cq_ready.3  26
-rw-r--r--  man/io_uring_cqe_get_data.3  53
l---------  man/io_uring_cqe_get_data64.3  1
-rw-r--r--  man/io_uring_cqe_seen.3  42
-rw-r--r--  man/io_uring_enter.2  410
-rw-r--r--  man/io_uring_free_probe.3  27
-rw-r--r--  man/io_uring_get_probe.3  30
-rw-r--r--  man/io_uring_get_sqe.3  22
-rw-r--r--  man/io_uring_opcode_supported.3  30
-rw-r--r--  man/io_uring_peek_cqe.3  38
-rw-r--r--  man/io_uring_prep_accept.3  159
l---------  man/io_uring_prep_accept_direct.3  1
-rw-r--r--  man/io_uring_prep_cancel.3  118
l---------  man/io_uring_prep_cancel64.3  1
-rw-r--r--  man/io_uring_prep_close.3  59
l---------  man/io_uring_prep_close_direct.3  1
-rw-r--r--  man/io_uring_prep_connect.3  66
-rw-r--r--  man/io_uring_prep_fadvise.3  59
-rw-r--r--  man/io_uring_prep_fallocate.3  59
-rw-r--r--  man/io_uring_prep_files_update.3  92
-rw-r--r--  man/io_uring_prep_fsync.3  70
l---------  man/io_uring_prep_link.3  1
-rw-r--r--  man/io_uring_prep_linkat.3  91
-rw-r--r--  man/io_uring_prep_madvise.3  56
l---------  man/io_uring_prep_mkdir.3  1
-rw-r--r--  man/io_uring_prep_mkdirat.3  83
-rw-r--r--  man/io_uring_prep_msg_ring.3  72
l---------  man/io_uring_prep_multishot_accept.3  1
l---------  man/io_uring_prep_multishot_accept_direct.3  1
-rw-r--r--  man/io_uring_prep_openat.3  117
-rw-r--r--  man/io_uring_prep_openat2.3  117
l---------  man/io_uring_prep_openat2_direct.3  1
l---------  man/io_uring_prep_openat_direct.3  1
-rw-r--r--  man/io_uring_prep_poll_add.3  72
l---------  man/io_uring_prep_poll_multishot.3  1
-rw-r--r--  man/io_uring_prep_poll_remove.3  55
-rw-r--r--  man/io_uring_prep_poll_update.3  89
-rw-r--r--  man/io_uring_prep_provide_buffers.3  131
-rw-r--r--  man/io_uring_prep_read.3  69
-rw-r--r--  man/io_uring_prep_read_fixed.3  72
-rw-r--r--  man/io_uring_prep_readv.3  85
-rw-r--r--  man/io_uring_prep_readv2.3  111
-rw-r--r--  man/io_uring_prep_recv.3  83
-rw-r--r--  man/io_uring_prep_recvmsg.3  94
-rw-r--r--  man/io_uring_prep_remove_buffers.3  52
l---------  man/io_uring_prep_rename.3  1
-rw-r--r--  man/io_uring_prep_renameat.3  96
-rw-r--r--  man/io_uring_prep_send.3  57
-rw-r--r--  man/io_uring_prep_sendmsg.3  69
-rw-r--r--  man/io_uring_prep_shutdown.3  53
-rw-r--r--  man/io_uring_prep_socket.3  97
l---------  man/io_uring_prep_socket_direct.3  1
-rw-r--r--  man/io_uring_prep_splice.3  80
-rw-r--r--  man/io_uring_prep_statx.3  74
l---------  man/io_uring_prep_symlink.3  1
-rw-r--r--  man/io_uring_prep_symlinkat.3  85
-rw-r--r--  man/io_uring_prep_sync_file_range.3  59
-rw-r--r--  man/io_uring_prep_tee.3  74
-rw-r--r--  man/io_uring_prep_timeout.3  95
l---------  man/io_uring_prep_timeout_remove.3  1
-rw-r--r--  man/io_uring_prep_timeout_update.3  98
l---------  man/io_uring_prep_unlink.3  1
-rw-r--r--  man/io_uring_prep_unlinkat.3  82
-rw-r--r--  man/io_uring_prep_write.3  67
-rw-r--r--  man/io_uring_prep_write_fixed.3  72
-rw-r--r--  man/io_uring_prep_writev.3  85
-rw-r--r--  man/io_uring_prep_writev2.3  111
-rw-r--r--  man/io_uring_queue_exit.3  7
-rw-r--r--  man/io_uring_queue_init.3  71
l---------  man/io_uring_queue_init_params.3  1
-rw-r--r--  man/io_uring_register.2  315
-rw-r--r--  man/io_uring_register_buf_ring.3  139
-rw-r--r--  man/io_uring_register_buffers.3  61
-rw-r--r--  man/io_uring_register_eventfd.3  51
l---------  man/io_uring_register_eventfd_async.3  1
-rw-r--r--  man/io_uring_register_files.3  50
-rw-r--r--  man/io_uring_register_iowq_aff.3  61
-rw-r--r--  man/io_uring_register_iowq_max_workers.3  71
-rw-r--r--  man/io_uring_register_ring_fd.3  49
-rw-r--r--  man/io_uring_setup.2  153
-rw-r--r--  man/io_uring_sq_ready.3  31
-rw-r--r--  man/io_uring_sq_space_left.3  25
-rw-r--r--  man/io_uring_sqe_set_data.3  48
l---------  man/io_uring_sqe_set_data64.3  1
-rw-r--r--  man/io_uring_sqe_set_flags.3  86
-rw-r--r--  man/io_uring_sqring_wait.3  34
-rw-r--r--  man/io_uring_submit.3  46
-rw-r--r--  man/io_uring_submit_and_wait.3  38
-rw-r--r--  man/io_uring_submit_and_wait_timeout.3  54
-rw-r--r--  man/io_uring_unregister_buf_ring.3  30
-rw-r--r--  man/io_uring_unregister_buffers.3  27
l---------  man/io_uring_unregister_eventfd.3  1
-rw-r--r--  man/io_uring_unregister_files.3  27
l---------  man/io_uring_unregister_iowq_aff.3  1
-rw-r--r--  man/io_uring_unregister_ring_fd.3  32
-rw-r--r--  man/io_uring_wait_cqe.3  40
-rw-r--r--  man/io_uring_wait_cqe_nr.3  43
-rw-r--r--  man/io_uring_wait_cqe_timeout.3  53
-rw-r--r--  man/io_uring_wait_cqes.3  56
-rw-r--r--  src/Makefile  41
-rw-r--r--  src/arch/aarch64/syscall.h  95
-rw-r--r--  src/arch/generic/lib.h  21
-rw-r--r--  src/arch/generic/syscall.h  87
-rw-r--r--  src/arch/syscall-defs.h  74
-rw-r--r--  src/arch/x86/lib.h  15
-rw-r--r--  src/arch/x86/syscall.h  300
-rw-r--r--  src/include/liburing.h  719
-rw-r--r--  src/include/liburing/barrier.h  8
-rw-r--r--  src/include/liburing/io_uring.h  192
-rw-r--r--  src/int_flags.h  9
-rw-r--r--  src/lib.h  57
-rw-r--r--  src/liburing.map  18
-rw-r--r--  src/nolibc.c  48
-rw-r--r--  src/queue.c  225
-rw-r--r--  src/register.c  328
-rw-r--r--  src/setup.c  157
-rw-r--r--  src/syscall.c  44
-rw-r--r--  src/syscall.h  99
-rw-r--r--  test/232c93d07b74.c (renamed from test/232c93d07b74-test.c)  11
-rw-r--r--  test/35fa71a030ca.c (renamed from test/35fa71a030ca-test.c)  6
-rw-r--r--  test/500f9fbadef8.c (renamed from test/500f9fbadef8-test.c)  0
-rw-r--r--  test/7ad0e4b2f83c.c (renamed from test/7ad0e4b2f83c-test.c)  0
-rw-r--r--  test/8a9973408177.c (renamed from test/8a9973408177-test.c)  0
-rw-r--r--  test/917257daa0fe.c (renamed from test/917257daa0fe-test.c)  0
-rw-r--r--  test/Makefile  327
-rw-r--r--  test/a0908ae19763.c (renamed from test/a0908ae19763-test.c)  0
-rw-r--r--  test/a4c0b3decb33.c (renamed from test/a4c0b3decb33-test.c)  0
-rw-r--r--  test/accept-link.c  11
-rw-r--r--  test/accept-test.c  2
-rw-r--r--  test/accept.c  576
-rw-r--r--  test/b19062a56726.c (renamed from test/b19062a56726-test.c)  0
-rw-r--r--  test/b5837bd5311d.c (renamed from test/b5837bd5311d-test.c)  0
-rw-r--r--  test/buf-ring.c  390
-rw-r--r--  test/ce593a6c480a.c (renamed from test/ce593a6c480a-test.c)  1
-rw-r--r--  test/connect.c  7
-rw-r--r--  test/cq-overflow.c  13
-rw-r--r--  test/cq-size.c  10
-rw-r--r--  test/d4ae271dfaae.c (renamed from test/d4ae271dfaae-test.c)  9
-rw-r--r--  test/d77a67ed5f27.c (renamed from test/d77a67ed5f27-test.c)  0
-rw-r--r--  test/defer.c  67
-rw-r--r--  test/double-poll-crash.c  22
-rw-r--r--  test/drop-submit.c  93
-rw-r--r--  test/eeed8b54e0df.c (renamed from test/eeed8b54e0df-test.c)  11
-rw-r--r--  test/empty-eownerdead.c  45
-rw-r--r--  test/eventfd-disable.c  2
-rw-r--r--  test/eventfd-reg.c  76
-rw-r--r--  test/eventfd-ring.c  2
-rw-r--r--  test/eventfd.c  2
-rw-r--r--  test/exec-target.c  6
-rw-r--r--  test/exit-no-cleanup.c  117
-rw-r--r--  test/fadvise.c  4
-rw-r--r--  test/fallocate.c  9
-rw-r--r--  test/fc2a85cb02ef.c (renamed from test/fc2a85cb02ef-test.c)  7
-rw-r--r--  test/file-register.c  117
-rw-r--r--  test/file-update.c  1
-rw-r--r--  test/file-verify.c  629
-rw-r--r--  test/fixed-buf-iter.c  115
-rw-r--r--  test/fixed-reuse.c  160
-rw-r--r--  test/fpos.c  252
-rw-r--r--  test/fsync.c  17
-rw-r--r--  test/hardlink.c  3
-rw-r--r--  test/helpers.c  33
-rw-r--r--  test/helpers.h  12
-rw-r--r--  test/io-cancel.c  53
-rw-r--r--  test/io_uring_enter.c  88
-rw-r--r--  test/io_uring_register.c  117
-rw-r--r--  test/io_uring_setup.c  26
-rw-r--r--  test/iopoll.c  76
-rw-r--r--  test/lfs-openat-write.c  2
-rw-r--r--  test/lfs-openat.c  2
-rw-r--r--  test/link-timeout.c  27
-rw-r--r--  test/link.c  6
-rw-r--r--  test/link_drain.c  3
-rw-r--r--  test/madvise.c  3
-rw-r--r--  test/mkdir.c  3
-rw-r--r--  test/msg-ring.c  236
-rw-r--r--  test/multicqes_drain.c  22
-rw-r--r--  test/nop-all-sizes.c  8
-rw-r--r--  test/nop.c  94
-rw-r--r--  test/open-close.c  115
-rw-r--r--  test/open-direct-link.c  188
-rw-r--r--  test/open-direct-pick.c  180
-rw-r--r--  test/openat2.c  231
-rw-r--r--  test/pipe-eof.c  2
-rw-r--r--  test/poll-cancel-all.c  472
-rw-r--r--  test/poll-cancel-ton.c  14
-rw-r--r--  test/poll-cancel.c  107
-rw-r--r--  test/poll-link.c  11
-rw-r--r--  test/poll-many.c  2
-rw-r--r--  test/poll-mshot-update.c  78
-rw-r--r--  test/poll-ring.c  2
-rw-r--r--  test/poll-v-poll.c  2
-rw-r--r--  test/poll.c  2
-rw-r--r--  test/pollfree.c  426
-rw-r--r--  test/probe.c  12
-rw-r--r--  test/read-before-exit.c  112
-rw-r--r--  test/read-write.c  67
-rw-r--r--  test/recv-msgall-stream.c  400
-rw-r--r--  test/recv-msgall.c  267
-rw-r--r--  test/register-restrictions.c  2
-rw-r--r--  test/rename.c  8
-rw-r--r--  test/ring-leak.c  84
-rw-r--r--  test/ring-leak2.c  3
-rw-r--r--  test/ringbuf-read.c  195
-rw-r--r--  test/rsrc_tags.c  32
-rwxr-xr-x  test/runtests-loop.sh  6
-rwxr-xr-x  test/runtests-quiet.sh  11
-rwxr-xr-x  test/runtests.sh  83
-rw-r--r--  test/rw_merge_test.c  3
-rw-r--r--  test/send_recv.c  11
-rw-r--r--  test/send_recvmsg.c  220
-rw-r--r--  test/sendmsg_fs_cve.c  10
-rw-r--r--  test/short-read.c  2
-rw-r--r--  test/shutdown.c  6
-rw-r--r--  test/sigfd-deadlock.c  2
-rw-r--r--  test/skip-cqe.c  429
-rw-r--r--  test/socket-rw-eagain.c  28
-rw-r--r--  test/socket-rw-offset.c  157
-rw-r--r--  test/socket-rw.c  22
-rw-r--r--  test/socket.c  408
-rw-r--r--  test/splice.c  1
-rw-r--r--  test/sq-poll-dup.c  19
-rw-r--r--  test/sq-poll-kthread.c  17
-rw-r--r--  test/sq-poll-share.c  10
-rw-r--r--  test/sqpoll-cancel-hang.c  157
-rw-r--r--  test/sqpoll-disable-exit.c  1
-rw-r--r--  test/sqpoll-exit-hang.c  3
-rw-r--r--  test/sqpoll-sleep.c  1
-rw-r--r--  test/statx.c  2
-rw-r--r--  test/submit-link-fail.c  156
-rw-r--r--  test/submit-reuse.c  27
-rw-r--r--  test/symlink.c  3
-rw-r--r--  test/test.h  35
-rw-r--r--  test/thread-exit.c  28
-rw-r--r--  test/timeout-new.c  24
-rw-r--r--  test/timeout-overflow.c  2
-rw-r--r--  test/timeout.c  190
-rw-r--r--  test/tty-write-dpoll.c  60
-rw-r--r--  test/unlink.c  3
-rw-r--r--  test/xattr.c  425
267 files changed, 17149 insertions, 1717 deletions
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..ae9f4de
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,86 @@
+
+<!-- Explain your changes here... -->
+
+----
+## git request-pull output:
+```
+<!-- START REPLACE ME -->
+
+Generate your PR shortlog and diffstat with these commands:
+ git remote add axboe-tree https://github.com/axboe/liburing
+ git fetch axboe-tree
+ git request-pull axboe-tree/master your_fork_URL your_branch_name
+
+Then replace this with the output of the `git request-pull` command.
+
+<!-- END REPLACE ME -->
+```
+----
+<details>
+<summary>Click to show/hide pull request guidelines</summary>
+
+## Pull Request Guidelines
+1. To make it easy for everyone to filter pull requests from the email
+notifications, use `[GIT PULL]` as a prefix in your PR title.
+```
+[GIT PULL] Your Pull Request Title
+```
+2. Follow the commit message format rules below.
+3. Follow the Linux kernel coding style (see: https://github.com/torvalds/linux/blob/master/Documentation/process/coding-style.rst).
+
+### Commit message format rules:
+1. The first line is the title (no more than 72 chars if possible).
+2. Then an empty line.
+3. Then a description (may be omitted for truly trivial changes).
+4. Then an empty line again (if it has a description).
+5. Then a `Signed-off-by` tag with your real name and email. For example:
+```
+Signed-off-by: Foo Bar <foo.bar@gmail.com>
+```
+
+The description should be word-wrapped at 72 chars. Some things should
+not be word-wrapped. They may be some kind of quoted text - long
+compiler error messages, oops reports, Link, etc. (things that have a
+certain specific format).
+
+Note that all of this goes in the commit message, not in the pull
+request text. The pull request text should introduce what this pull
+request does, and each commit message should explain the rationale for
+why that particular change was made. The git tree is the canonical
+source of truth, not GitHub.
+
+Each patch should do one thing, and one thing only. If you find yourself
+writing an explanation for why a patch is fixing multiple issues, that's
+a good indication that the change should be split into separate patches.
+
+If the commit is a fix for an issue, add a `Fixes` tag with the issue
+URL.
+
+Don't use GitHub anonymous email like this as the commit author:
+```
+123456789+username@users.noreply.github.com
+```
+
+Use a real email address!
+
+### Commit message example:
+```
+src/queue: don't flush SQ ring for new wait interface
+
+If we have IORING_FEAT_EXT_ARG, then timeouts are done through the
+syscall instead of by posting an internal timeout. This was done
+to be both more efficient, but also to enable multi-threaded use
+of the wait side. If we touch the SQ state by flushing it, that isn't
+safe without synchronization.
+
+Fixes: https://github.com/axboe/liburing/issues/402
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+```
+
+</details>
+
+----
+## By submitting this pull request, I acknowledge that:
+1. I have followed the above pull request guidelines.
+2. I have the rights to submit this work under the same license.
+3. I agree to a Developer Certificate of Origin (see https://developercertificate.org for more information).
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..88192ff
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,127 @@
+name: Build test
+
+on:
+ # Trigger the workflow on push or pull requests.
+ push:
+ pull_request:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ # x86-64 gcc
+ - arch: x86_64
+ cc_pkg: gcc-x86-64-linux-gnu
+ cxx_pkg: g++-x86-64-linux-gnu
+ cc: x86_64-linux-gnu-gcc
+ cxx: x86_64-linux-gnu-g++
+
+ # x86-64 clang
+ - arch: x86_64
+ cc_pkg: clang
+ cxx_pkg: clang
+ cc: clang
+ cxx: clang++
+
+ # x86 (32-bit) gcc
+ - arch: i686
+ cc_pkg: gcc-i686-linux-gnu
+ cxx_pkg: g++-i686-linux-gnu
+ cc: i686-linux-gnu-gcc
+ cxx: i686-linux-gnu-g++
+
+ # aarch64 gcc
+ - arch: aarch64
+ cc_pkg: gcc-aarch64-linux-gnu
+ cxx_pkg: g++-aarch64-linux-gnu
+ cc: aarch64-linux-gnu-gcc
+ cxx: aarch64-linux-gnu-g++
+
+ # arm (32-bit) gcc
+ - arch: arm
+ cc_pkg: gcc-arm-linux-gnueabi
+ cxx_pkg: g++-arm-linux-gnueabi
+ cc: arm-linux-gnueabi-gcc
+ cxx: arm-linux-gnueabi-g++
+
+ # powerpc64
+ - arch: powerpc64
+ cc_pkg: gcc-powerpc64-linux-gnu
+ cxx_pkg: g++-powerpc64-linux-gnu
+ cc: powerpc64-linux-gnu-gcc
+ cxx: powerpc64-linux-gnu-g++
+
+ # powerpc
+ - arch: powerpc
+ cc_pkg: gcc-powerpc-linux-gnu
+ cxx_pkg: g++-powerpc-linux-gnu
+ cc: powerpc-linux-gnu-gcc
+ cxx: powerpc-linux-gnu-g++
+
+ # alpha
+ - arch: alpha
+ cc_pkg: gcc-alpha-linux-gnu
+ cxx_pkg: g++-alpha-linux-gnu
+ cc: alpha-linux-gnu-gcc
+ cxx: alpha-linux-gnu-g++
+
+ # mips64
+ - arch: mips64
+ cc_pkg: gcc-mips64-linux-gnuabi64
+ cxx_pkg: g++-mips64-linux-gnuabi64
+ cc: mips64-linux-gnuabi64-gcc
+ cxx: mips64-linux-gnuabi64-g++
+
+ # mips
+ - arch: mips
+ cc_pkg: gcc-mips-linux-gnu
+ cxx_pkg: g++-mips-linux-gnu
+ cc: mips-linux-gnu-gcc
+ cxx: mips-linux-gnu-g++
+
+ env:
+ FLAGS: -g -O2 -Wall -Wextra -Werror
+
+ steps:
+ - name: Checkout source
+ uses: actions/checkout@v2
+
+ - name: Install Compilers
+ run: |
+ if [[ "${{matrix.cc_pkg}}" == "clang" ]]; then \
+ wget https://apt.llvm.org/llvm.sh -O /tmp/llvm.sh; \
+ sudo bash /tmp/llvm.sh 15; \
+ sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 400; \
+ sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 400; \
+ else \
+ sudo apt-get update -y; \
+ sudo apt-get install -y ${{matrix.cc_pkg}} ${{matrix.cxx_pkg}}; \
+ fi;
+
+ - name: Display compiler versions
+ run: |
+ ${{matrix.cc}} --version;
+ ${{matrix.cxx}} --version;
+
+ - name: Build
+ run: |
+ ./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}};
+ make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS";
+
+ - name: Build nolibc
+ run: |
+ if [[ "${{matrix.arch}}" == "x86_64" || "${{matrix.arch}}" == "i686" ]]; then \
+ make clean; \
+ ./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}} --nolibc; \
+ make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS"; \
+ else \
+ echo "Skipping nolibc build, this arch doesn't support building liburing without libc"; \
+ fi;
+
+ - name: Test install command
+ run: |
+ sudo make install;
diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml
new file mode 100644
index 0000000..8873f0b
--- /dev/null
+++ b/.github/workflows/shellcheck.yml
@@ -0,0 +1,20 @@
+name: Shellcheck
+
+on:
+ # Trigger the workflow on push or pull requests.
+ push:
+ pull_request:
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout source
+ uses: actions/checkout@v2
+
+ - name: Display shellcheck version
+ run: shellcheck --version
+
+ - name: Shellcheck execution
+ run: shellcheck test/runtest*.sh
diff --git a/.gitignore b/.gitignore
index 7a6f75c..b5acffd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
*~
/*.patch
+*.d
*.o
*.o[ls]
@@ -15,116 +16,14 @@
/examples/link-cp
/examples/ucontext-cp
-/test/232c93d07b74-test
-/test/35fa71a030ca-test
-/test/500f9fbadef8-test
-/test/7ad0e4b2f83c-test
-/test/8a9973408177-test
-/test/917257daa0fe-test
-/test/a0908ae19763-test
-/test/a4c0b3decb33-test
-/test/accept
-/test/accept-link
-/test/accept-reuse
-/test/accept-test
-/test/across-fork
-/test/b19062a56726-test
-/test/b5837bd5311d-test
-/test/ce593a6c480a-test
-/test/close-opath
-/test/config.local
-/test/connect
-/test/cq-full
-/test/cq-overflow
-/test/cq-overflow-peek
-/test/cq-peek-batch
-/test/cq-ready
-/test/cq-size
-/test/d4ae271dfaae-test
-/test/d77a67ed5f27-test
-/test/defer
-/test/double-poll-crash
-/test/eeed8b54e0df-test
-/test/eventfd
-/test/eventfd-disable
-/test/eventfd-ring
-/test/fadvise
-/test/fallocate
-/test/fc2a85cb02ef-test
-/test/file-register
-/test/file-update
-/test/files-exit-hang-poll
-/test/files-exit-hang-timeout
-/test/fixed-link
-/test/fsync
-/test/hardlink
-/test/io-cancel
-/test/io_uring_enter
-/test/io_uring_register
-/test/io_uring_setup
-/test/iopoll
-/test/lfs-openat
-/test/lfs-openat-write
-/test/link
-/test/link-timeout
-/test/link_drain
-/test/madvise
-/test/mkdir
-/test/nop
-/test/nop-all-sizes
-/test/open-close
-/test/openat2
-/test/personality
-/test/pipe-eof
-/test/pipe-reuse
-/test/poll
-/test/poll-cancel
-/test/poll-cancel-ton
-/test/poll-link
-/test/poll-many
-/test/poll-ring
-/test/poll-v-poll
-/test/probe
-/test/read-write
-/test/register-restrictions
-/test/rename
-/test/ring-leak
-/test/ring-leak2
-/test/self
-/test/send_recv
-/test/send_recvmsg
-/test/sendmsg_fs_cve
-/test/shared-wq
-/test/short-read
-/test/shutdown
-/test/sigfd-deadlock
-/test/socket-rw
-/test/socket-rw-eagain
-/test/splice
-/test/sq-full
-/test/sq-full-cpp
-/test/sq-poll-dup
-/test/sq-poll-kthread
-/test/sq-poll-share
-/test/sqpoll-disable-exit
-/test/sqpoll-exit-hang
-/test/sqpoll-sleep
-/test/sq-space_left
-/test/statx
-/test/stdout
-/test/submit-reuse
-/test/symlink
-/test/teardowns
-/test/thread-exit
-/test/timeout
-/test/timeout-new
-/test/timeout-overflow
-/test/unlink
-/test/wakeup-hang
+/test/*.t
/test/*.dmesg
+/test/output/
config-host.h
config-host.mak
config.log
liburing.pc
+
+cscope.out
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index e02fdd0..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-language: cpp
-os:
- - linux
-compiler:
- - clang
- - gcc
-env:
- matrix:
- - BUILD_ARCH="x86"
- - BUILD_ARCH="x86_64"
- global:
- - MAKEFLAGS="-j 2"
-matrix:
- exclude:
- - os: linux
- compiler: clang
- env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter
-before_install:
- - EXTRA_CFLAGS="-Werror"
-script:
- - ./configure && make
- - sudo make runtests || true
diff --git a/Android.bp b/Android.bp
index 4a6eda9..5fc7a9b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -30,9 +30,15 @@ cc_defaults {
"-Wall",
"-Werror",
"-Wno-pointer-arith",
+ "-Wno-unused-parameter",
+ "-Wno-implicit-function-declaration",
+ "-D_GNU_SOURCE"
],
include_dirs: ["bionic/libc/kernel"],
- export_include_dirs: ["src/include"],
+ export_include_dirs: [
+ "src/include",
+ "src/arch",
+ ],
srcs: [
"src/queue.c",
"src/register.c",
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..01cb677
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,29 @@
+liburing-2.2 release
+
+- Support non-libc builds.
+- Optimized syscall handling for x86-64/x86/aarch64.
+- Enable non-lib function calls for fast path functions.
+- Add support for multishot accept.
+- io_uring_register_files() will set RLIMIT_NOFILE if necessary.
+- Add support for registered ring fds, io_uring_register_ring_fd(),
+ reducing the overhead of an io_uring_enter() system call.
+- Add support for the message ring opcode.
+- Add support for newer request cancelation features.
+- Add support for IORING_SETUP_COOP_TASKRUN, which can help reduce the
+ overhead of io_uring in general. Most applications should set this flag,
+ see the io_uring_setup.2 man page for details.
+- Add support for registering a sparse buffer and file set.
+- Add support for a new buffer provide scheme, see
+ io_uring_register_buf_ring.3 for details.
+- Add io_uring_submit_and_wait_timeout() for submitting IO and waiting
+ for completions with a timeout.
+- Add io_uring_prep_{read,write}v2 prep helpers.
+- Add io_uring_prep_close_direct() helper.
+- Add support for SQE128 and CQE32, which are doubly sized SQE and CQE
+ rings. This is needed for some cases of the new IORING_OP_URING_CMD,
+ notably for NVMe passthrough.
+- ~5500 lines of man page additions, including ~90 new man pages.
+- Synced with the 5.19 kernel release, supporting all the features of
+ 5.19 and earlier.
+- 24 new regression test cases, and ~7000 lines of new tests in general.
+- General optimizations and fixes.
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..089f674
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+preferred-citation:
+type: software
+authors:
+ - family-names: "Axboe"
+ given-names: "Jens"
+ email: axboe@kernel.dk
+title: "liburing library for io_uring"
+year: 2022
+url: "https://github.com/axboe/liburing"
+license: MIT
diff --git a/LICENSE b/LICENSE
index ae941fa..d559f33 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,20 @@
Copyright 2020 Jens Axboe
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/METADATA b/METADATA
index fb168cc..e5962aa 100644
--- a/METADATA
+++ b/METADATA
@@ -11,7 +11,7 @@ third_party {
type: GIT
value: "https://github.com/axboe/liburing"
}
- version: "2.0"
- last_upgrade_date { year: 2021 month: 7 day: 20 }
+ version: "2.2"
+ last_upgrade_date { year: 2022 month: 10 day: 28 }
license_type: NOTICE
}
diff --git a/Makefile b/Makefile
index 5d9c4dc..686be4f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,5 @@
-NAME=liburing
-SPECFILE=$(NAME).spec
-VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE))
-TAG = $(NAME)-$(VERSION)
+include Makefile.common
+
RPMBUILD=$(shell `which rpmbuild >&/dev/null` && echo "rpmbuild" || echo "rpm")
INSTALL=install
@@ -21,8 +19,10 @@ partcheck: all
runtests: all
@$(MAKE) -C test runtests
-runtests-loop:
+runtests-loop: all
@$(MAKE) -C test runtests-loop
+runtests-parallel: all
+ @$(MAKE) -C test runtests-parallel
config-host.mak: configure
@if [ ! -e "$@" ]; then \
diff --git a/Makefile.common b/Makefile.common
new file mode 100644
index 0000000..27fc233
--- /dev/null
+++ b/Makefile.common
@@ -0,0 +1,6 @@
+TOP := $(dir $(CURDIR)/$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))
+NAME=liburing
+SPECFILE=$(TOP)/$(NAME).spec
+VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE))
+VERSION_MAJOR=$(shell echo $(VERSION) | cut -d. -f1)
+TAG = $(NAME)-$(VERSION)
diff --git a/README b/README
index a76021e..80d2b3d 100644
--- a/README
+++ b/README
@@ -16,6 +16,15 @@ and development for both kernel and userspace. The list is archived here:
https://lore.kernel.org/io-uring/
+kernel version dependency
+--------------------------
+
+liburing itself is not tied to any specific kernel release, and hence it's
+possible to use the newest liburing release even on older kernels (and vice
+versa). Newer features may only be available on more recent kernels,
+obviously.
+
+
ulimit settings
---------------
@@ -26,7 +35,9 @@ it quickly. root isn't under this restriction, but regular users are. Going
into detail on how to bump the limit on various systems is beyond the scope
of this little blurb, but check /etc/security/limits.conf for user specific
settings, or /etc/systemd/user.conf and /etc/systemd/system.conf for systemd
-setups.
+setups. This affects 5.11 and earlier; newer kernels are less dependent
+on RLIMIT_MEMLOCK, as it is only used for registering buffers.
+
Regressions tests
-----------------
@@ -35,6 +46,7 @@ The bulk of liburing is actually regression/unit tests for both liburing and
the kernel io_uring support. Please note that this suite isn't expected to
pass on older kernels, and may even crash or hang older kernels!
+
License
-------
@@ -43,4 +55,4 @@ COPYING and LICENSE, except for a header coming from the kernel which is
dual licensed GPL with a Linux-syscall-note exception and MIT, see
COPYING.GPL and <https://spdx.org/licenses/Linux-syscall-note.html>.
-Jens Axboe 2020-01-20
+Jens Axboe 2022-05-19
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..c9c2ffe
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,6 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+Please report any security issue to axboe@kernel.dk where the issue will be triaged appropriately.
+Thank you in advance for helping to keep liburing secure.
diff --git a/configure b/configure
index 3b96cde..2c2441b 100755
--- a/configure
+++ b/configure
@@ -1,10 +1,12 @@
#!/bin/sh
+set -e
+
cc=${CC:-gcc}
cxx=${CXX:-g++}
for opt do
- optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)')
+ optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)' || true)
case "$opt" in
--help|-h) show_help=yes
;;
@@ -24,6 +26,8 @@ for opt do
;;
--cxx=*) cxx="$optarg"
;;
+ --nolibc) liburing_nolibc="yes"
+ ;;
*)
echo "ERROR: unknown option $opt"
echo "Try '$0 --help' for more information"
@@ -69,18 +73,25 @@ Options: [defaults in brackets after descriptions]
--libdevdir=PATH install development libraries in PATH [$libdevdir]
--mandir=PATH install man pages in PATH [$mandir]
--datadir=PATH install shared data in PATH [$datadir]
+ --cc=CMD use CMD as the C compiler
+ --cxx=CMD use CMD as the C++ compiler
+ --nolibc build liburing without libc
EOF
exit 0
fi
-TMPC="$(mktemp --tmpdir fio-conf-XXXXXXXXXX.c)"
-TMPC2="$(mktemp --tmpdir fio-conf-XXXXXXXXXX-2.c)"
-TMPO="$(mktemp --tmpdir fio-conf-XXXXXXXXXX.o)"
-TMPE="$(mktemp --tmpdir fio-conf-XXXXXXXXXX.exe)"
+TMP_DIRECTORY="$(mktemp -d)"
+TMPC="$TMP_DIRECTORY/liburing-conf.c"
+TMPC2="$TMP_DIRECTORY/liburing-conf-2.c"
+TMPCXX="$TMP_DIRECTORY/liburing-conf-2.cpp"
+TMPO="$TMP_DIRECTORY/liburing-conf.o"
+TMPE="$TMP_DIRECTORY/liburing-conf.exe"
+
+touch $TMPC $TMPC2 $TMPCXX $TMPO $TMPE
# NB: do not call "exit" in the trap handler; this is buggy with some shells;
# see <1285349658-3122-1-git-send-email-loic.minier@linaro.org>
-trap "rm -f $TMPC $TMPC2 $TMPO $TMPE" EXIT INT QUIT TERM
+trap "rm -rf $TMP_DIRECTORY" EXIT INT QUIT TERM
rm -rf config.log
@@ -163,7 +174,7 @@ compile_prog_cxx() {
local_cflags="$1"
local_ldflags="$2 $LIBS"
echo "Compiling test case $3" >> config.log
- do_cxx $CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
+ do_cxx $CFLAGS $local_cflags -o $TMPE $TMPCXX $LDFLAGS $local_ldflags
}
has() {
@@ -192,6 +203,37 @@ print_and_output_mak "mandir" "$mandir"
print_and_output_mak "datadir" "$datadir"
##########################################
+# check for compiler -Wstringop-overflow
+stringop_overflow="no"
+cat > $TMPC << EOF
+#include <linux/fs.h>
+int main(int argc, char **argv)
+{
+ return 0;
+}
+EOF
+if compile_prog "-Werror -Wstringop-overflow=0" "" "stringop_overflow"; then
+ stringop_overflow="yes"
+fi
+print_config "stringop_overflow" "$stringop_overflow"
+
+##########################################
+# check for compiler -Warray-bounds
+array_bounds="no"
+cat > $TMPC << EOF
+#include <linux/fs.h>
+int main(int argc, char **argv)
+{
+ return 0;
+}
+EOF
+if compile_prog "-Werror -Warray-bounds=0" "" "array_bounds"; then
+ array_bounds="yes"
+fi
+print_config "array_bounds" "$array_bounds"
+
+
+##########################################
# check for __kernel_rwf_t
__kernel_rwf_t="no"
cat > $TMPC << EOF
@@ -232,9 +274,9 @@ print_config "__kernel_timespec" "$__kernel_timespec"
open_how="no"
cat > $TMPC << EOF
#include <sys/types.h>
-#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
+#include <linux/openat2.h>
int main(int argc, char **argv)
{
struct open_how how;
@@ -258,7 +300,6 @@ cat > $TMPC << EOF
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
-#include <linux/stat.h>
int main(int argc, char **argv)
{
struct statx x;
@@ -272,9 +313,30 @@ fi
print_config "statx" "$statx"
##########################################
+# check for glibc statx
+glibc_statx="no"
+cat > $TMPC << EOF
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+int main(int argc, char **argv)
+{
+ struct statx x;
+
+ return memset(&x, 0, sizeof(x)) != NULL;
+}
+EOF
+if compile_prog "" "" "glibc_statx"; then
+ glibc_statx="yes"
+fi
+print_config "glibc_statx" "$glibc_statx"
+
+##########################################
# check for C++
has_cxx="no"
-cat > $TMPC << EOF
+cat > $TMPCXX << EOF
#include <iostream>
int main(int argc, char **argv)
{
@@ -296,6 +358,7 @@ int main(int argc, char **argv)
{
ucontext_t ctx;
getcontext(&ctx);
+ makecontext(&ctx, 0, 0);
return 0;
}
EOF
@@ -304,8 +367,30 @@ if compile_prog "" "" "has_ucontext"; then
fi
print_config "has_ucontext" "$has_ucontext"
+##########################################
+# check for memfd_create(2)
+has_memfd_create="no"
+cat > $TMPC << EOF
+#include <sys/mman.h>
+int main(int argc, char **argv)
+{
+ int memfd = memfd_create("test", 0);
+ return 0;
+}
+EOF
+if compile_prog "-Werror=implicit-function-declaration" "" "has_memfd_create"; then
+ has_memfd_create="yes"
+fi
+print_config "has_memfd_create" "$has_memfd_create"
+
#############################################################################
+if test "$liburing_nolibc" = "yes"; then
+ output_sym "CONFIG_NOLIBC"
+else
+ liburing_nolibc="no"
+fi
+print_config "liburing_nolibc" "$liburing_nolibc"
if test "$__kernel_rwf_t" = "yes"; then
output_sym "CONFIG_HAVE_KERNEL_RWF_T"
@@ -319,12 +404,24 @@ fi
if test "$statx" = "yes"; then
output_sym "CONFIG_HAVE_STATX"
fi
+if test "$glibc_statx" = "yes"; then
+ output_sym "CONFIG_HAVE_GLIBC_STATX"
+fi
if test "$has_cxx" = "yes"; then
output_sym "CONFIG_HAVE_CXX"
fi
if test "$has_ucontext" = "yes"; then
output_sym "CONFIG_HAVE_UCONTEXT"
fi
+if test "$stringop_overflow" = "yes"; then
+ output_sym "CONFIG_HAVE_STRINGOP_OVERFLOW"
+fi
+if test "$array_bounds" = "yes"; then
+ output_sym "CONFIG_HAVE_ARRAY_BOUNDS"
+fi
+if test "$has_memfd_create" = "yes"; then
+ output_sym "CONFIG_HAVE_MEMFD_CREATE"
+fi
echo "CC=$cc" >> $config_host_mak
print_config "CC" "$cc"
@@ -373,6 +470,16 @@ struct open_how {
};
EOF
+else cat >> $compat_h << EOF
+#include <linux/openat2.h>
+
+EOF
+fi
+if [ "$glibc_statx" = "no" ] && [ "$statx" = "yes" ]; then
+cat >> $compat_h << EOF
+#include <sys/stat.h>
+
+EOF
fi
cat >> $compat_h << EOF
diff --git a/examples/Makefile b/examples/Makefile
index 60c1b71..95a45f9 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,8 +1,8 @@
CPPFLAGS ?=
override CPPFLAGS += -D_GNU_SOURCE -I../src/include/
-CFLAGS ?= -g -O2
-XCFLAGS =
-override CFLAGS += -Wall -L../src/
+CFLAGS ?= -g -O2 -Wall
+LDFLAGS ?=
+override LDFLAGS += -L../src/ -luring
include ../Makefile.quiet
@@ -10,20 +10,29 @@ ifneq ($(MAKECMDGOALS),clean)
include ../config-host.mak
endif
-all_targets += io_uring-test io_uring-cp link-cp
+example_srcs := \
+ io_uring-cp.c \
+ io_uring-test.c \
+ link-cp.c
+
+all_targets :=
+
ifdef CONFIG_HAVE_UCONTEXT
-all_targets += ucontext-cp
+ example_srcs += ucontext-cp.c
endif
+all_targets += ucontext-cp
-all: $(all_targets)
+example_targets := $(patsubst %.c,%,$(patsubst %.cc,%,$(example_srcs)))
+all_targets += $(example_targets)
-test_srcs := io_uring-test.c io_uring-cp.c link-cp.c
-test_objs := $(patsubst %.c,%.ol,$(test_srcs))
+all: $(example_targets)
-%: %.c
- $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< -luring $(XCFLAGS)
+%: %.c ../src/liburing.a
+ $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(LDFLAGS)
clean:
- @rm -f $(all_targets) $(test_objs)
+ @rm -f $(all_targets)
+
+.PHONY: all clean
diff --git a/examples/io_uring-cp.c b/examples/io_uring-cp.c
index 2a44c30..43444d8 100644
--- a/examples/io_uring-cp.c
+++ b/examples/io_uring-cp.c
@@ -127,7 +127,8 @@ static int copy_file(struct io_uring *ring, off_t insize)
writes = reads = offset = 0;
while (insize || write_left) {
- int had_reads, got_comp;
+ unsigned long had_reads;
+ int got_comp;
/*
* Queue up as many reads as we can
@@ -188,18 +189,20 @@ static int copy_file(struct io_uring *ring, off_t insize)
if (cqe->res < 0) {
if (cqe->res == -EAGAIN) {
queue_prepped(ring, data);
+ io_uring_submit(ring);
io_uring_cqe_seen(ring, cqe);
continue;
}
fprintf(stderr, "cqe failed: %s\n",
strerror(-cqe->res));
return 1;
- } else if (cqe->res != data->iov.iov_len) {
+ } else if ((size_t)cqe->res != data->iov.iov_len) {
/* Short read/write, adjust and requeue */
data->iov.iov_base += cqe->res;
data->iov.iov_len -= cqe->res;
data->offset += cqe->res;
queue_prepped(ring, data);
+ io_uring_submit(ring);
io_uring_cqe_seen(ring, cqe);
continue;
}
diff --git a/examples/link-cp.c b/examples/link-cp.c
index e15dfc3..018124e 100644
--- a/examples/link-cp.c
+++ b/examples/link-cp.c
@@ -26,7 +26,7 @@ struct io_data {
};
static int infd, outfd;
-static unsigned inflight;
+static int inflight;
static int setup_context(unsigned entries, struct io_uring *ring)
{
@@ -95,7 +95,7 @@ static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
if (cqe->res < 0) {
if (cqe->res == -ECANCELED) {
- queue_rw_pair(ring, BS, data->offset);
+ queue_rw_pair(ring, data->iov.iov_len, data->offset);
inflight += 2;
} else {
printf("cqe error: %s\n", strerror(-cqe->res));
@@ -115,7 +115,7 @@ static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
static int copy_file(struct io_uring *ring, off_t insize)
{
struct io_uring_cqe *cqe;
- size_t this_size;
+ off_t this_size;
off_t offset;
offset = 0;
diff --git a/examples/ucontext-cp.c b/examples/ucontext-cp.c
index ea0c934..281013f 100644
--- a/examples/ucontext-cp.c
+++ b/examples/ucontext-cp.c
@@ -16,7 +16,7 @@
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/timerfd.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "liburing.h"
#define QD 64
diff --git a/liburing.spec b/liburing.spec
index 0268d23..df62d2f 100644
--- a/liburing.spec
+++ b/liburing.spec
@@ -1,5 +1,5 @@
Name: liburing
-Version: 2.0
+Version: 2.2
Release: 1%{?dist}
Summary: Linux-native io_uring I/O access library
License: (GPLv2 with exceptions and LGPLv2+) or MIT
diff --git a/man/io_uring.7 b/man/io_uring.7
index a63b3e9..8c71d93 100644
--- a/man/io_uring.7
+++ b/man/io_uring.7
@@ -84,17 +84,35 @@ a read operation under
.BR io_uring ,
started with the
.BR IORING_OP_READ
-operation,
-which issues the equivalent of the
+operation, issues the equivalent of the
.BR read (2)
-system call,
-would return as part of
+system call. In practice, it mixes the semantics of
+.BR pread (2)
+and
+.BR preadv2 (2)
+in that it takes an explicit offset, and supports using -1 for the offset to
+indicate that the current file position should be used instead of passing in
+an explicit offset. See the opcode documentation for more details. Given that
+io_uring is an async interface,
+.I errno
+is never used for passing back error information. Instead,
.I res
-what
-.BR read (2)
-would have returned if called directly,
-without using
-.BR io_uring .
+will contain what the equivalent system call would have returned in case
+of success, and in case of error
+.I res
+will contain
+.I -errno .
+For example, if the normal read system call would have returned -1 and set
+.I errno
+to
+.B EINVAL ,
+then
+.I res
+would contain
+.B -EINVAL .
+If the normal system call would have returned a read size of 1024, then
+.I res
+would contain 1024.
.IP \(bu
Optionally,
.BR io_uring_enter (2)
@@ -259,7 +277,8 @@ you need to acquire a submission queue entry (SQE) from the submission
queue (SQ),
fill it up with details of the operation you want to submit and call
.BR io_uring_enter (2).
-If you want to avoid calling
+There are helper functions of the form io_uring_prep_X to enable proper
+setup of the SQE. If you want to avoid calling
.BR io_uring_enter (2),
you have the option of setting up Submission Queue Polling.
.PP
@@ -425,7 +444,7 @@ successful read and update of the head.
Because of the shared ring buffers between kernel and user space,
.B io_uring
can be a zero-copy system.
-Copying buffers to and fro becomes necessary when system calls that
+Copying buffers to and from becomes necessary when system calls that
transfer data between kernel and user space are involved.
But since the bulk of the communication in
.B io_uring
@@ -435,7 +454,7 @@ this huge performance overhead is completely avoided.
While system calls may not seem like a significant overhead,
in high performance applications,
making a lot of them will begin to matter.
-While workarounds the operating system has in place to deal with Specter
+While workarounds the operating system has in place to deal with Spectre
and Meltdown are ideally best done away with,
unfortunately,
some of these workarounds are around the system call interface,
@@ -466,7 +485,7 @@ them to the submission queue. This avoids the
.BR io_uring_enter (2)
call you need to make to tell the kernel to pick SQEs up.
For high-performance applications,
-this means even lesser system call overheads.
+this means even fewer system call overheads.
.SH CONFORMING TO
.B io_uring
is Linux-specific.
@@ -533,8 +552,8 @@ int io_uring_setup(unsigned entries, struct io_uring_params *p)
int io_uring_enter(int ring_fd, unsigned int to_submit,
unsigned int min_complete, unsigned int flags)
{
- return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
- flags, NULL, 0);
+ return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
+ min_complete, flags, NULL, 0);
}
int app_setup_uring(void) {
@@ -623,7 +642,7 @@ int app_setup_uring(void) {
int read_from_cq() {
struct io_uring_cqe *cqe;
- unsigned head, reaped = 0;
+ unsigned head;
/* Read barrier */
head = io_uring_smp_load_acquire(cring_head);
@@ -678,10 +697,10 @@ int submit_to_sq(int fd, int op) {
io_uring_smp_store_release(sring_tail, tail);
/*
- * Tell the kernel we have submitted events with the io_uring_enter() system
- * call. We also pass in the IOURING_ENTER_GETEVENTS flag which causes the
- * io_uring_enter() call to wait until min_complete (the 3rd param) events
- * complete.
+ * Tell the kernel we have submitted events with the io_uring_enter()
+ * system call. We also pass in the IORING_ENTER_GETEVENTS flag which
+ * causes the io_uring_enter() call to wait until min_complete
+ * (the 3rd param) events complete.
* */
int ret = io_uring_enter(ring_fd, 1,1,
IORING_ENTER_GETEVENTS);
diff --git a/man/io_uring_buf_ring_add.3 b/man/io_uring_buf_ring_add.3
new file mode 100644
index 0000000..9d8283b
--- /dev/null
+++ b/man/io_uring_buf_ring_add.3
@@ -0,0 +1,53 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_buf_ring_add 3 "May 18, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_buf_ring_add \- add buffers to a shared buffer ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_buf_ring_add(struct io_uring_buf_ring *" br ",
+.BI " void *" addr ",
+.BI " unsigned int " len ",
+.BI " unsigned short " bid ",
+.BI " int " mask ",
+.BI " int " buf_offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_buf_ring_add (3)
+adds a new buffer to the shared buffer ring
+.IR br .
+The buffer address is indicated by
+.I addr
+and is of
+.I len
+bytes of length.
+.I bid
+is the buffer ID, which will be returned in the CQE.
+.I mask
+is the size mask of the ring, available from
+.BR io_uring_buf_ring_mask (3) .
+.I buf_offset
+is the offset to insert at from the current tail. If just one buffer is provided
+before the ring tail is committed with
+.BR io_uring_buf_ring_advance (3)
+or
+.BR io_uring_buf_ring_cq_advance (3),
+then
+.I buf_offset
+should be 0. If buffers are provided in a loop before being committed, the
+.I buf_offset
+must be incremented by one for each buffer added.
+
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_mask (3),
+.BR io_uring_buf_ring_advance (3),
+.BR io_uring_buf_ring_cq_advance (3)
diff --git a/man/io_uring_buf_ring_advance.3 b/man/io_uring_buf_ring_advance.3
new file mode 100644
index 0000000..29a3578
--- /dev/null
+++ b/man/io_uring_buf_ring_advance.3
@@ -0,0 +1,31 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_buf_ring_advance 3 "May 18, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_buf_ring_advance \- advance index of provided buffer in buffer ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_buf_ring_advance(struct io_uring_buf_ring *" br ",
+.BI " int " count ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_buf_ring_advance (3)
+commits
+.I count
+previously added buffers to the shared buffer ring
+.IR br ,
+making them visible to the kernel and hence consumable. This passes ownership
+of the buffer to the ring.
+
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_add (3),
+.BR io_uring_buf_ring_cq_advance (3)
diff --git a/man/io_uring_buf_ring_cq_advance.3 b/man/io_uring_buf_ring_cq_advance.3
new file mode 100644
index 0000000..caf882f
--- /dev/null
+++ b/man/io_uring_buf_ring_cq_advance.3
@@ -0,0 +1,41 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_buf_ring_cq_advance 3 "May 18, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_buf_ring_cq_advance \- advance index of provided buffer and CQ ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_buf_ring_cq_advance(struct io_uring *" ring ",
+.BI " struct io_uring_buf_ring *" br ",
+.BI " int " count ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_buf_ring_cq_advance (3)
+commits
+.I count
+previously added buffers to the shared buffer ring
+.IR br ,
+making them visible to the kernel and hence consumable. This passes ownership
+of the buffer to the ring. At the same time, it advances the CQ ring of
+.I ring
+by
+.I count
+amount. This effectively bundles both a
+.BR io_uring_buf_ring_advance (3)
+call and a
+.BR io_uring_cq_advance (3)
+into one operation. Since updating either ring index entails a store memory
+barrier, doing both at once is more efficient.
+
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_add (3),
+.BR io_uring_buf_ring_advance (3)
diff --git a/man/io_uring_buf_ring_init.3 b/man/io_uring_buf_ring_init.3
new file mode 100644
index 0000000..50cf69a
--- /dev/null
+++ b/man/io_uring_buf_ring_init.3
@@ -0,0 +1,30 @@
+.\" Copyright (C) 2022 Dylan Yudaken <dylany@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_buf_ring_init 3 "June 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_buf_ring_init \- Initialise a buffer ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_buf_ring_init(struct io_uring_buf_ring *" br ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_buf_ring_init (3)
+initialises
+.IR br
+so that it is ready to be used. It may be called after
+.BR io_uring_register_buf_ring (3)
+but must be called before the buffer ring is used in any other way.
+
+.SH RETURN VALUE
+None
+
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_add (3)
+.BR io_uring_buf_ring_advance (3),
+.BR io_uring_buf_ring_cq_advance (3)
diff --git a/man/io_uring_buf_ring_mask.3 b/man/io_uring_buf_ring_mask.3
new file mode 100644
index 0000000..9160663
--- /dev/null
+++ b/man/io_uring_buf_ring_mask.3
@@ -0,0 +1,27 @@
+.\" Copyright (C) 2022 Dylan Yudaken <dylany@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_buf_ring_mask 3 "June 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_buf_ring_mask \- Calculate buffer ring mask size
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_buf_ring_mask(__u32 " ring_entries ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_buf_ring_mask (3)
+calculates the appropriate size mask for a buffer ring.
+.IR ring_entries
+is the ring entries as specified in
+.BR io_uring_register_buf_ring (3) .
+
+.SH RETURN VALUE
+Size mask for the buffer ring.
+
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_add (3)
diff --git a/man/io_uring_cq_advance.3 b/man/io_uring_cq_advance.3
new file mode 100644
index 0000000..fae2572
--- /dev/null
+++ b/man/io_uring_cq_advance.3
@@ -0,0 +1,49 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_cq_advance 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_cq_advance \- mark one or more io_uring completion events as consumed
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_cq_advance(struct io_uring *" ring ","
+.BI " unsigned " nr ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_cq_advance (3)
+function marks
+.I nr
+IO completions belonging to the
+.I ring
+param as consumed.
+
+After the caller has submitted a request with
+.BR io_uring_submit (3),
+the application can retrieve the completion with
+.BR io_uring_wait_cqe (3),
+.BR io_uring_peek_cqe (3),
+or any of the other CQE retrieval helpers, and mark it as consumed with
+.BR io_uring_cqe_seen (3).
+
+The function
+.BR io_uring_cqe_seen (3)
+calls the function
+.BR io_uring_cq_advance (3).
+
+Completions must be marked as seen, so their slot can get reused. Failure to do
+so will result in the same completion being returned on the next invocation.
+
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqe (3),
+.BR io_uring_peek_cqe (3),
+.BR io_uring_wait_cqes (3),
+.BR io_uring_wait_cqe_timeout (3),
+.BR io_uring_cqe_seen (3)
diff --git a/man/io_uring_cq_ready.3 b/man/io_uring_cq_ready.3
new file mode 100644
index 0000000..e411a64
--- /dev/null
+++ b/man/io_uring_cq_ready.3
@@ -0,0 +1,26 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_cq_ready 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_cq_ready \- returns number of unconsumed ready entries in the CQ ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "unsigned io_uring_cq_ready(const struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_cq_ready (3)
+function returns the number of unconsumed ready entries belonging to the
+.I ring
+param.
+
+.SH RETURN VALUE
+Returns the number of unconsumed ready entries in the CQ ring.
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqe (3)
diff --git a/man/io_uring_cqe_get_data.3 b/man/io_uring_cqe_get_data.3
new file mode 100644
index 0000000..4cbb32c
--- /dev/null
+++ b/man/io_uring_cqe_get_data.3
@@ -0,0 +1,53 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_cqe_get_data 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_cqe_get_data \- get user data for completion event
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void *io_uring_cqe_get_data(struct io_uring_cqe *" cqe ");"
+.BI "
+.BI "__u64 io_uring_cqe_get_data64(struct io_uring_cqe *" cqe ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_cqe_get_data (3)
+function returns the user_data with the completion queue entry
+.IR cqe
+as a data pointer.
+
+The
+.BR io_uring_cqe_get_data64 (3)
+function returns the user_data with the completion queue entry
+.IR cqe
+as a 64-bit data value.
+
+After the caller has received a completion queue entry (CQE) with
+.BR io_uring_wait_cqe (3),
+the application can call
+.BR io_uring_cqe_get_data (3)
+or
+.BR io_uring_cqe_get_data64 (3)
+function to retrieve the
+.I user_data
+value. This requires that
+.I user_data
+has been set earlier with the function
+.BR io_uring_sqe_set_data (3)
+or
+.BR io_uring_sqe_set_data64 (3).
+
+.SH RETURN VALUE
+If the
+.I user_data
+value has been set before submitting the request, it will be returned.
+Otherwise, the function returns NULL.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_sqe_set_data (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_cqe_get_data64.3 b/man/io_uring_cqe_get_data64.3
new file mode 120000
index 0000000..51991c2
--- /dev/null
+++ b/man/io_uring_cqe_get_data64.3
@@ -0,0 +1 @@
+io_uring_cqe_get_data.3 \ No newline at end of file
diff --git a/man/io_uring_cqe_seen.3 b/man/io_uring_cqe_seen.3
new file mode 100644
index 0000000..d2f2984
--- /dev/null
+++ b/man/io_uring_cqe_seen.3
@@ -0,0 +1,42 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_cqe_seen 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_cqe_seen \- mark io_uring completion event as consumed
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_cqe_seen(struct io_uring *" ring ","
+.BI " struct io_uring_cqe *" cqe ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_cqe_seen (3)
+function marks the IO completion
+.I cqe
+belonging to the
+.I ring
+param as consumed.
+
+After the caller has submitted a request with
+.BR io_uring_submit (3),
+the application can retrieve the completion with
+.BR io_uring_wait_cqe (3),
+.BR io_uring_peek_cqe (3),
+or any of the other CQE retrieval helpers, and mark it as consumed with
+.BR io_uring_cqe_seen (3).
+
+Completions must be marked as seen so their slot can get reused.
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqe (3),
+.BR io_uring_peek_cqe (3),
+.BR io_uring_wait_cqes (3),
+.BR io_uring_wait_cqe_timeout (3),
+.BR io_uring_cqe_seen (3)
diff --git a/man/io_uring_enter.2 b/man/io_uring_enter.2
index 909cc9b..3c04541 100644
--- a/man/io_uring_enter.2
+++ b/man/io_uring_enter.2
@@ -55,6 +55,52 @@ application can no longer get a free SQE entry to submit, without knowing
when one becomes available as the SQ kernel thread consumes them. If
the system call is used with this flag set, then it will wait until at least
one entry is free in the SQ ring.
+.TP
+.B IORING_ENTER_EXT_ARG
+Since kernel 5.11, the system calls arguments have been modified to look like
+the following:
+
+.nf
+.BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
+.BI " unsigned int " min_complete ", unsigned int " flags ,
+.BI " const void *" arg ", size_t " argsz );
+.fi
+
+which behaves just like the original definition by default. However, if
+.B IORING_ENTER_EXT_ARG
+is set, then instead of a
+.I sigset_t
+being passed in, a pointer to a
+.I struct io_uring_getevents_arg
+is used instead and
+.I argsz
+must be set to the size of this structure. The definition is as follows:
+
+.nf
+.BI "struct io_uring_getevents_args {
+.BI " __u64 sigmask;
+.BI " __u32 sigmask_sz;
+.BI " __u32 pad;
+.BI " __u64 ts;
+.BI "};
+.fi
+
+which allows passing in both a signal mask and a pointer to a
+.I struct __kernel_timespec
+timeout value. If
+.I ts
+is set to a valid pointer, then this time value indicates the timeout for
+waiting on events. If an application is waiting on events and wishes to
+stop waiting after a specified amount of time, then this can be accomplished
+directly in version 5.11 and newer by using this feature.
+.TP
+.B IORING_ENTER_REGISTERED_RING
+If the ring file descriptor has been registered through use of
+.B IORING_REGISTER_RING_FDS,
+then setting this flag will tell the kernel that the
+.I ring_fd
+passed in is the registered ring offset rather than a normal file descriptor.
+
.PP
.PP
If the io_uring instance was configured for polling, by specifying
@@ -159,22 +205,28 @@ struct io_uring_sqe {
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
+ __u32 rename_flags;
+ __u32 unlink_flags;
+ __u32 hardlink_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
- struct {
- /* index into fixed buffers, if used */
+ struct {
+ /* index into fixed buffers, if used */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
}
- /* personality to use, if used */
- __u16 personality;
+ /* personality to use, if used */
+ __u16 personality;
+ union {
__s32 splice_fd_in;
+ __u32 file_index;
};
- __u64 __pad2[3];
+ };
+ __u64 __pad2[3];
};
};
.EE
@@ -228,11 +280,55 @@ specified in the
.I poll_events
field. Unlike poll or epoll without
.BR EPOLLONESHOT ,
-this interface always works in one shot mode. That is, once the poll
-operation is completed, it will have to be resubmitted. This command works like
+by default this interface always works in one shot mode. That is, once the poll
+operation is completed, it will have to be resubmitted.
+
+If
+.B IORING_POLL_ADD_MULTI
+is set in the SQE
+.I len
+field, then the poll will work in multi shot mode instead. That means it'll
+repeatedly trigger when the requested event becomes true, and hence multiple
+CQEs can be generated from this single SQE. The CQE
+.I flags
+field will have
+.B IORING_CQE_F_MORE
+set on completion if the application should expect further CQE entries from
+the original request. If this flag isn't set on completion, then the poll
+request has been terminated and no further events will be generated. This mode
+is available since 5.13.
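+
+An illustrative sketch, assuming an initialized ring, a readable
+.I fd
+and an example user_data value (POLLIN comes from <poll.h>):
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+/* Post a CQE every time fd becomes readable. */
+io_uring_prep_poll_multishot(sqe, fd, POLLIN);
+sqe->user_data = 0x1234;
+.EE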
+
+If
+.B IORING_POLL_UPDATE_EVENTS
+is set in the SQE
+.I len
+field, then the request will update an existing poll request with the mask of
+events passed in with this request. The lookup is based on the
+.I user_data
+field of the original SQE submitted, and this value is passed in the
+.I addr
+field of the SQE. This mode is available since 5.13.
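+
+A hedged sketch using the liburing helper (signature as of liburing 2.2,
+where the user_data arguments are 64-bit values), re-arming an existing poll
+request to wait for writability instead:
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+/* 0x1234 identifies the original poll request's user_data. */
+io_uring_prep_poll_update(sqe, 0x1234, 0x1234, POLLOUT,
+                          IORING_POLL_UPDATE_EVENTS);
+.EE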
+
+If
+.B IORING_POLL_UPDATE_USER_DATA
+is set in the SQE
+.I len
+field, then the request will update the
+.I user_data
+of an existing poll request based on the value passed in the
+.I off
+field. This mode is available since 5.13.
+
+This command works like
an async
.BR poll(2)
-and the completion event result is the returned mask of events.
+and the completion event result is the returned mask of events. For the
+variants that update
+.I user_data
+or
+.IR events ,
+the completion result will be similar to
+.B IORING_OP_POLL_REMOVE.
.TP
.B IORING_OP_POLL_REMOVE
@@ -243,7 +339,10 @@ field of the
will contain 0. If not found,
.I res
will contain
-.B -ENOENT.
+.B -ENOENT,
+or
+.B -EALREADY
+if the poll request was in the process of completing already.
.TP
.B IORING_OP_EPOLL_CTL
@@ -342,10 +441,32 @@ clock source. The request will complete with
if the timeout got completed through expiration of the timer, or
.I 0
if the timeout got completed through requests completing on their own. If
-the timeout was cancelled before it expired, the request will complete with
+the timeout was canceled before it expired, the request will complete with
.I -ECANCELED.
Available since 5.4.
+Since 5.15, this command also supports the following modifiers in
+.I timeout_flags:
+
+.PP
+.in +12
+.B IORING_TIMEOUT_BOOTTIME
+If set, then the clocksource used is
+.I CLOCK_BOOTTIME
+instead of
+.I CLOCK_MONOTONIC.
+This clocksource differs in that it includes time spent while the system
+was suspended with a timeout request in-flight.
+
+.B IORING_TIMEOUT_REALTIME
+If set, then the clocksource used is
+.I CLOCK_REALTIME
+instead of
+.I CLOCK_MONOTONIC.
+.in
+.PP
+
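+A brief sketch (ring setup assumed) of requesting a timeout measured against
+the realtime clock via the liburing helper:
+
+.EX
+struct __kernel_timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+/* Complete after 5 seconds of CLOCK_REALTIME time. */
+io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_REALTIME);
+.EE
+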
.TP
.B IORING_OP_TIMEOUT_REMOVE
If
@@ -355,7 +476,7 @@ operation.
must contain the
.I user_data
field of the previously issued timeout operation. If the specified timeout
-request is found and cancelled successfully, this request will terminate
+request is found and canceled successfully, this request will terminate
with a result value of
+.I 0.
If the timeout request was found but expiration was already in progress,
@@ -370,13 +491,14 @@ If
.I timeout_flags
contain
.I IORING_TIMEOUT_UPDATE,
-instead of removing an existing operation it updates it.
+instead of removing an existing operation, it updates it.
.I addr
and return values are same as before.
.I addr2
field must contain a pointer to a struct timespec64 structure.
.I timeout_flags
-may also contain IORING_TIMEOUT_ABS.
+may also contain IORING_TIMEOUT_ABS, in which case the value given is an
+absolute one, not a relative one.
Available since 5.11.
.TP
@@ -389,26 +511,47 @@ must be set to the socket file descriptor,
.I addr
must contain the pointer to the sockaddr structure, and
.I addr2
-must contain a pointer to the socklen_t addrlen field. See also
+must contain a pointer to the socklen_t addrlen field. Flags can be passed using
+the
+.I accept_flags
+field. See also
.BR accept4(2)
for the general description of the related system call. Available since 5.5.
+If the
+.I file_index
+field is set to a positive number, the file won't be installed into the
+normal file table as usual but will be placed into the fixed file table at index
+.I file_index - 1.
+In this case, instead of returning a file descriptor, the result will contain
+either 0 on success or an error. If the index points to a valid empty slot, the
+installation is guaranteed to not fail. If there is already a file in the slot,
+it will be replaced, similar to
+.B IORING_OP_FILES_UPDATE.
+Please note that only io_uring has access to such files and no other syscall
+can use them. See
+.B IOSQE_FIXED_FILE
+and
+.B IORING_REGISTER_FILES.
+
+Available since 5.15.
+
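+An illustrative sketch of a direct-descriptor accept into fixed file slot 0,
+assuming the file table has already been registered and listen_fd is an
+example listening socket:
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+/* Install the accepted socket at fixed file table index 0. */
+io_uring_prep_accept_direct(sqe, listen_fd, NULL, NULL, 0, 0);
+.EE
+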
.TP
.B IORING_OP_ASYNC_CANCEL
Attempt to cancel an already issued request.
.I addr
must contain the
.I user_data
-field of the request that should be cancelled. The cancellation request will
+field of the request that should be canceled. The cancelation request will
complete with one of the following results codes. If found, the
.I res
field of the cqe will contain 0. If not found,
.I res
-will contain -ENOENT. If found and attempted cancelled, the
+will contain -ENOENT. If found and cancelation was attempted, the
.I res
field will contain -EALREADY. In this case, the request may or may not
terminate. In general, requests that are interruptible (like socket IO) will
-get cancelled, while disk IO requests cannot be cancelled if already started.
+get canceled, while disk IO requests cannot be canceled if already started.
Available since 5.5.
.TP
@@ -426,9 +569,9 @@ If used, the timeout specified in the command will cancel the linked command,
unless the linked command completes before the timeout. The timeout will
complete with
.I -ETIME
-if the timer expired and the linked request was attempted cancelled, or
+if the timer expired and the linked request was attempted canceled, or
.I -ECANCELED
-if the timer got cancelled because of completion of the linked request. Like
+if the timer got canceled because of completion of the linked request. Like
.B IORING_OP_TIMEOUT
the clock source used is
.B CLOCK_MONOTONIC
@@ -516,6 +659,24 @@ is access mode of the file. See also
.BR openat(2)
for the general description of the related system call. Available since 5.6.
+If the
+.I file_index
+field is set to a positive number, the file won't be installed into the
+normal file table as usual but will be placed into the fixed file table at index
+.I file_index - 1.
+In this case, instead of returning a file descriptor, the result will contain
+either 0 on success or an error. If the index points to a valid empty slot, the
+installation is guaranteed to not fail. If there is already a file in the slot,
+it will be replaced, similar to
+.B IORING_OP_FILES_UPDATE.
+Please note that only io_uring has access to such files and no other syscall
+can use them. See
+.B IOSQE_FIXED_FILE
+and
+.B IORING_REGISTER_FILES.
+
+Available since 5.15.
+
.TP
.B IORING_OP_OPENAT2
Issue the equivalent of a
@@ -536,6 +697,24 @@ should be set to the address of the open_how structure. See also
.BR openat2(2)
for the general description of the related system call. Available since 5.6.
+If the
+.I file_index
+field is set to a positive number, the file won't be installed into the
+normal file table as usual but will be placed into the fixed file table at index
+.I file_index - 1.
+In this case, instead of returning a file descriptor, the result will contain
+either 0 on success or an error. If the index points to a valid empty slot, the
+installation is guaranteed to not fail. If there is already a file in the slot,
+it will be replaced, similar to
+.B IORING_OP_FILES_UPDATE.
+Please note that only io_uring has access to such files and no other syscall
+can use them. See
+.B IOSQE_FIXED_FILE
+and
+.B IORING_REGISTER_FILES.
+
+Available since 5.15.
+
.TP
.B IORING_OP_CLOSE
Issue the equivalent of a
@@ -545,6 +724,18 @@ system call.
is the file descriptor to be closed. See also
.BR close(2)
for the general description of the related system call. Available since 5.6.
+If the
+.I file_index
+field is set to a positive number, this command can be used to close files
+that were direct opened through
+.B IORING_OP_OPENAT,
+.B IORING_OP_OPENAT2,
+or
+.B IORING_OP_ACCEPT
+using the io_uring specific direct descriptors. Note that only one of the
+descriptor fields may be set. The direct close feature is available since
+the 5.15 kernel, where direct descriptors were introduced.
.TP
.B IORING_OP_STATX
@@ -596,7 +787,9 @@ does not refer to a seekable file,
.I off
must be set to zero. If
.I off
-is set to -1, the offset will use (and advance) the file position, like the
+is set to
+.BR -1 ,
+the offset will use (and advance) the file position, like the
.BR read(2)
and
.BR write(2)
@@ -622,8 +815,9 @@ is an offset to read from,
.I fd
is the file descriptor to write to,
.I off
-is an offset from which to start writing to. A sentinel value of -1 is used
-to pass the equivalent of a NULL for the offsets to
+is an offset from which to start writing. A sentinel value of
+.B -1
+is used to pass the equivalent of a NULL for the offsets to
.BR splice(2).
.I len
contains the number of bytes to copy.
@@ -724,8 +918,11 @@ Issue the equivalent of a
.BR shutdown(2)
system call.
.I fd
-is the file descriptor to the socket being shutdown, no other fields should
-be set. Available since 5.11.
+is the file descriptor to the socket being shut down, and
+.I len
+must be set to the
+.I how
+argument. No other fields should be set. Available since 5.11.
.TP
.B IORING_OP_RENAMEAT
@@ -774,6 +971,90 @@ being passed in to
.BR unlinkat(2).
Available since 5.11.
+.TP
+.B IORING_OP_MKDIRAT
+Issue the equivalent of a
+.BR mkdirat(2)
+system call.
+.I fd
+should be set to the
+.I dirfd,
+.I addr
+should be set to the
+.I pathname,
+and
+.I len
+should be set to the
+.I mode
+being passed in to
+.BR mkdirat(2).
+Available since 5.15.
+
+.TP
+.B IORING_OP_SYMLINKAT
+Issue the equivalent of a
+.BR symlinkat(2)
+system call.
+.I fd
+should be set to the
+.I newdirfd,
+.I addr
+should be set to the
+.I target
+and
+.I addr2
+should be set to the
+.I linkpath
+being passed in to
+.BR symlinkat(2).
+Available since 5.15.
+
+.TP
+.B IORING_OP_LINKAT
+Issue the equivalent of a
+.BR linkat(2)
+system call.
+.I fd
+should be set to the
+.I olddirfd,
+.I addr
+should be set to the
+.I oldpath,
+.I len
+should be set to the
+.I newdirfd,
+.I addr2
+should be set to the
+.I newpath,
+and
+.I hardlink_flags
+should be set to the
+.I flags
+being passed in to
+.BR linkat(2).
+Available since 5.15.
+
+.TP
+.B IORING_OP_MSG_RING
+Send a message to an io_uring.
+.I fd
+must be set to a file descriptor of a ring that the application has access to,
+.I len
+can be set to any 32-bit value that the application wishes to pass on, and
+.I off
+should be set to any 64-bit value that the application wishes to send. On the
+target ring, a CQE will be posted with the
+.I res
+field matching the
+.I len
+set, and a
+.I user_data
+field matching the
+.I off
+value being passed in. This request type can be used to either just wake or
+interrupt anyone waiting for completions on the target ring, or it can be used
+to pass messages via the two fields. Available since 5.18.
+
.PP
The
.I flags
@@ -786,7 +1067,10 @@ is an index into the files array registered with the io_uring instance (see the
.B IORING_REGISTER_FILES
section of the
.BR io_uring_register (2)
-man page). Available since 5.1.
+man page). Note that this isn't available for all commands. If used on
+a command that doesn't support fixed files, the SQE will error with
+.B -EBADF.
+Available since 5.1.
.TP
.B IOSQE_IO_DRAIN
When this flag is specified, the SQE will not be started before previously
@@ -794,12 +1078,14 @@ submitted SQEs have completed, and new SQEs will not be started before this
one completes. Available since 5.2.
.TP
.B IOSQE_IO_LINK
-When this flag is specified, it forms a link with the next SQE in the
-submission ring. That next SQE will not be started before this one completes.
-This, in effect, forms a chain of SQEs, which can be arbitrarily long. The tail
-of the chain is denoted by the first SQE that does not have this flag set.
-This flag has no effect on previous SQE submissions, nor does it impact SQEs
-that are outside of the chain tail. This means that multiple chains can be
+When this flag is specified, the SQE forms a link with the next SQE in the
+submission ring. That next SQE will not be started before the previous request
+completes. This, in effect, forms a chain of SQEs, which can be arbitrarily
+long. The tail of the chain is denoted by the first SQE that does not have this
+flag set. Chains are not supported across submission boundaries. Even if the
+last SQE in a submission has this flag set, it will still terminate the current
+chain. This flag has no effect on previous SQE submissions, nor does it impact
+SQEs that are outside of the chain tail. This means that multiple chains can be
executing in parallel, or chains and individual SQEs. Only members inside the
chain are serialized. A chain of SQEs will be broken, if any request in that
chain ends in error. io_uring considers any unexpected result an error. This
@@ -829,7 +1115,7 @@ Used in conjunction with the
command, which registers a pool of buffers to be used by commands that read
or receive data. When buffers are registered for this use case, and this
flag is set in the command, io_uring will grab a buffer from this pool when
-the request is ready to receive or read data. If succesful, the resulting CQE
+the request is ready to receive or read data. If successful, the resulting CQE
will have
.B IORING_CQE_F_BUFFER
set in the flags part of the struct, and the upper
@@ -841,6 +1127,37 @@ are available and this flag is set, then the request will fail with
as the error code. Once a buffer has been used, it is no longer available in
the kernel pool. The application must re-register the given buffer again when
it is ready to recycle it (eg has completed using it). Available since 5.7.
+.TP
+.B IOSQE_CQE_SKIP_SUCCESS
+Don't generate a CQE if the request completes successfully. If the request
+fails, an appropriate CQE will be posted as usual and if there is no
+.B IOSQE_IO_HARDLINK,
+CQEs for all linked requests will be omitted. The notion of failure/success is
+opcode specific and is the same as with breaking chains of
+.B IOSQE_IO_LINK.
+One special case is when the request has a linked timeout, then the CQE
+generation for the linked timeout is decided solely by whether it has
+.B IOSQE_CQE_SKIP_SUCCESS
+set, regardless of whether it timed out or was canceled. In other words, if a
+linked timeout has the flag set, it's guaranteed to not post a CQE.
+
+The semantics are chosen to accommodate several use cases. First, when all but
+the last request of a normal link without linked timeouts are marked with the
+flag, only one CQE per link is posted. Additionally, it enables suppression of
+CQEs in cases where the side effects of a successfully executed operation are
+enough for userspace to know the state of the system. One such example would
+be writing to a synchronization file.
+
+This flag is incompatible with
+.B IOSQE_IO_DRAIN.
+Using both of them in a single ring is undefined behavior, even when they are
+not used together in a single request. Currently, after the first request with
+.B IOSQE_CQE_SKIP_SUCCESS,
+all subsequent requests marked with drain will be failed at submission time.
+Note that the error reporting is best effort only, and restrictions may change
+in the future.
+
+Available since 5.17.
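+
+A hedged sketch of suppressing the CQE for the first request in a two-step
+link, so that only the final request posts a completion (fd, buf and len are
+example values):
+
+.EX
+struct io_uring_sqe *sqe;
+
+sqe = io_uring_get_sqe(ring);
+io_uring_prep_write(sqe, fd, buf, len, 0);
+sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
+
+sqe = io_uring_get_sqe(ring);
+io_uring_prep_fsync(sqe, fd, 0);
+.EE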
.PP
.I ioprio
@@ -933,7 +1250,13 @@ is copied from the field of the same name in the submission queue
entry. The primary use case is to store data that the application
will need to access upon completion of this particular I/O. The
.I flags
-is reserved for future use.
+is used for certain commands, like
+.B IORING_OP_POLL_ADD
+or in conjunction with
+.B IOSQE_BUFFER_SELECT
+or
+.BR IORING_OP_MSG_RING ;
+see those entries for details.
.I res
is the operation-specific result, but io_uring-specific errors
(e.g. flags or opcode invalid) are returned through this field.
@@ -941,13 +1264,22 @@ They are described in section
.B CQE ERRORS.
.PP
For read and write opcodes, the
-return values match those documented in the
+return values match
+.I errno
+values documented in the
.BR preadv2 (2)
and
.BR pwritev2 (2)
-man pages.
-Return codes for the io_uring-specific opcodes are documented in the
-description of the opcodes above.
+man pages, with
+.I res
+holding the equivalent of
+.I -errno
+for error cases, or the transferred number of bytes in case the operation
+is successful. Hence both error and success return can be found in that
+field in the CQE. For other request types, the return values are documented
+in the matching man page for that type, or in the opcodes section above for
+io_uring-specific opcodes.
.PP
.SH RETURN VALUE
.BR io_uring_enter ()
@@ -967,7 +1299,9 @@ completion queue entry (see section
rather than through the system call itself.
Errors that occur not on behalf of a submission queue entry are returned via the
-system call directly. On such an error, -1 is returned and
+system call directly. On such an error,
+.B -1
+is returned and
.I errno
is set appropriately.
.PP
diff --git a/man/io_uring_free_probe.3 b/man/io_uring_free_probe.3
new file mode 100644
index 0000000..d2308fa
--- /dev/null
+++ b/man/io_uring_free_probe.3
@@ -0,0 +1,27 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_free_probe "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_free_probe \- free probe instance
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_free_probe(struct io_uring_probe *" probe ");"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR io_uring_free_probe (3)
+frees the
+.I probe
+instance allocated with the
+.BR io_uring_get_probe (3)
+function.
+
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_get_probe (3)
diff --git a/man/io_uring_get_probe.3 b/man/io_uring_get_probe.3
new file mode 100644
index 0000000..94c1b21
--- /dev/null
+++ b/man/io_uring_get_probe.3
@@ -0,0 +1,30 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_get_probe "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_get_probe \- get probe instance
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "io_uring_probe *io_uring_get_probe(void);"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR io_uring_get_probe (3)
+returns an allocated io_uring_probe structure to the caller. The caller is
+responsible for freeing the structure with the function
+.BR io_uring_free_probe (3).
+
+.SH NOTES
+Earlier versions of the Linux kernel do not support probe. If the kernel
+doesn't support probe, this function will return NULL.
+
+.SH RETURN VALUE
+On success it returns an allocated io_uring_probe structure, otherwise
+it returns NULL.
+.SH SEE ALSO
+.BR io_uring_free_probe (3)
diff --git a/man/io_uring_get_sqe.3 b/man/io_uring_get_sqe.3
index 24834f3..58c8b96 100644
--- a/man/io_uring_get_sqe.3
+++ b/man/io_uring_get_sqe.3
@@ -5,25 +5,28 @@
.\"
.TH io_uring_get_sqe 3 "July 10, 2020" "liburing-0.7" "liburing Manual"
.SH NAME
-io_uring_get_sqe - get the next vacant event from the submission queue
+io_uring_get_sqe \- get the next available submission queue entry from the
+submission queue
.SH SYNOPSIS
.nf
-.BR "#include <liburing.h>"
+.B #include <liburing.h>
.PP
-.BI "struct io_uring_sqe *io_uring_get_sqe(struct io_uring " *ring );
+.BI "struct io_uring_sqe *io_uring_get_sqe(struct io_uring *" ring ");"
.fi
-.PP
.SH DESCRIPTION
.PP
-The io_uring_get_sqe() function gets the next vacant event from the submission
+The
+.BR io_uring_get_sqe (3)
+function gets the next available submission queue entry from the submission
queue belonging to the
.I ring
param.
-On success io_uring_get_sqe() returns a pointer to the submission queue event.
-On failure NULL is returned.
+On success
+.BR io_uring_get_sqe (3)
+returns a pointer to the submission queue entry. On failure NULL is returned.
-If a submission queue event is returned, it should be filled out via one of the
+If a submission queue entry is returned, it should be filled out via one of the
prep functions such as
.BR io_uring_prep_read (3)
and submitted via
@@ -32,6 +35,7 @@ and submitted via
.SH RETURN VALUE
.BR io_uring_get_sqe (3)
returns a pointer to the next submission queue event on success and NULL on
-failure.
+failure. If NULL is returned, the SQ ring is currently full and entries must
+be submitted for processing before new ones can get allocated.
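+.PP
+A common pattern (illustrative only, ring setup assumed) is to flush pending
+submissions and retry when the SQ ring is full:
+.PP
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+if (!sqe) {
+    /* SQ ring full: submit what's queued, then try again. */
+    io_uring_submit(ring);
+    sqe = io_uring_get_sqe(ring);
+}
+.EE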
.SH SEE ALSO
.BR io_uring_submit (3)
diff --git a/man/io_uring_opcode_supported.3 b/man/io_uring_opcode_supported.3
new file mode 100644
index 0000000..b20b504
--- /dev/null
+++ b/man/io_uring_opcode_supported.3
@@ -0,0 +1,30 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_opcode_supported "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_opcode_supported \- is op code supported?
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_opcode_supported(struct io_uring_probe *" probe ","
+.BI " int " opcode ");"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR io_uring_opcode_supported (3)
+allows the caller to determine if the passed in
+.I opcode
+belonging to the
+.I probe
+param is supported. An io_uring_probe instance can be
+obtained by calling the function
+.BR io_uring_get_probe (3).
+
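+A short usage sketch (error handling elided for brevity):
+
+.EX
+struct io_uring_probe *probe = io_uring_get_probe();
+int have_openat2 = probe && io_uring_opcode_supported(probe, IORING_OP_OPENAT2);
+
+if (probe)
+    io_uring_free_probe(probe);
+.EE
+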
+.SH RETURN VALUE
+On success it returns 1, otherwise it returns 0.
+.SH SEE ALSO
+.BR io_uring_get_probe (3)
diff --git a/man/io_uring_peek_cqe.3 b/man/io_uring_peek_cqe.3
new file mode 100644
index 0000000..a4ac2da
--- /dev/null
+++ b/man/io_uring_peek_cqe.3
@@ -0,0 +1,38 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_peek_cqe 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_peek_cqe \- check if an io_uring completion event is available
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_peek_cqe(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_peek_cqe (3)
+function returns an IO completion from the queue belonging to the
+.I ring
+param, if one is readily available. On successful return,
+.I cqe_ptr
+param is filled with a valid CQE entry.
+
+This function does not enter the kernel to wait for an event; an event
+is only returned if it's already available in the CQ ring.
+
+.SH RETURN VALUE
+On success
+.BR io_uring_peek_cqe (3)
+returns
+.B 0
+and the cqe_ptr parameter is filled in. On failure it returns
+.BR -EAGAIN .
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqes (3),
+.BR io_uring_wait_cqe (3)
diff --git a/man/io_uring_prep_accept.3 b/man/io_uring_prep_accept.3
new file mode 100644
index 0000000..3800ccb
--- /dev/null
+++ b/man/io_uring_prep_accept.3
@@ -0,0 +1,159 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_accept 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_accept \- prepare an accept request
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_accept(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " struct sockaddr *" addr ","
+.BI " socklen_t *" addrlen ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_accept_direct(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " struct sockaddr *" addr ","
+.BI " socklen_t *" addrlen ","
+.BI " int " flags ","
+.BI " unsigned int " file_index ");"
+.PP
+.BI "void io_uring_prep_multishot_accept(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " struct sockaddr *" addr ","
+.BI " socklen_t *" addrlen ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_multishot_accept_direct(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " struct sockaddr *" addr ","
+.BI " socklen_t *" addrlen ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_accept (3)
+function prepares an accept request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I sockfd
+to start accepting a connection request described by the socket address at
+.I addr
+and of structure length
+.I addrlen
+and using modifier flags in
+.IR flags .
+
+For a direct descriptor accept request, the offset is specified by the
+.I file_index
+argument. Direct descriptors are io_uring private file descriptors. They
+avoid some of the overhead associated with thread shared file tables and
+can be used in any io_uring request that takes a file descriptor. To do so,
+.B IOSQE_FIXED_FILE
+must be set in the SQE
+.I flags
+member, and the SQE
+.I fd
+field should use the direct descriptor value rather than the regular file
+descriptor. Direct descriptors are managed like registered files.
+
+If the direct variant is used, the application must first have registered
+a file table using
+.BR io_uring_register_files (3)
+of the appropriate size. Once registered, a direct accept request may use any
+entry in that table, as long as it is within the size of the registered table.
+If a specified entry already contains a file, the file will first be removed
+from the table and closed. It's consistent with the behavior of updating an
+existing file with
+.BR io_uring_register_files_update (3).
+Note that old kernels don't check the SQE
+.I file_index
+field, which is not a problem for liburing helpers, but users of the raw
+io_uring interface need to zero SQEs to avoid unexpected behavior. This also
+means that applications should check for availability of
+.B IORING_OP_ACCEPT_DIRECT
+before using it; they cannot rely on a
+.B -EINVAL
+CQE
+.I res
+return.
+
+For a direct descriptor accept request, the
+.I file_index
+argument can be set to
+.BR IORING_FILE_INDEX_ALLOC .
+In this case a free entry in the io_uring file table will
+be used automatically and the file index will be returned as the CQE
+.IR res .
+.B -ENFILE
+is otherwise returned if there are no free entries in the io_uring file table.
+
+The multishot versions of accept and accept_direct allow an application to issue
+a single accept request, which will repeatedly trigger a CQE when a connection
+request comes in. Like other multishot type requests, the application should
+look at the CQE
+.I flags
+and see if
+.B IORING_CQE_F_MORE
+is set on completion as an indication of whether or not the accept request
+will generate further CQEs. The multishot variants are available since 5.19.
+
+For multishot with direct descriptors,
+.B IORING_FILE_INDEX_ALLOC
+must be used as the file descriptor. This tells io_uring to allocate a free
+direct descriptor from our table, rather than the application passing one in.
+Failure to do so will result in the accept request being terminated with
+.BR -EINVAL .
+The allocated descriptor will be returned in the CQE
+.I res
+field, like a non-direct accept request.
+
+These functions prepare an async
+.BR accept4 (2)
+request. See that man page for details.
+
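+As a hedged sketch, a multishot accept with automatically allocated direct
+descriptors might be armed and reaped as follows (ring setup and listen_fd
+assumed):
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+io_uring_prep_multishot_accept_direct(sqe, listen_fd, NULL, NULL, 0);
+io_uring_submit(ring);
+
+for (;;) {
+    struct io_uring_cqe *cqe;
+    unsigned flags;
+
+    if (io_uring_wait_cqe(ring, &cqe))
+        break;
+    /* On success, cqe->res holds the allocated direct descriptor. */
+    flags = cqe->flags;
+    io_uring_cqe_seen(ring, cqe);
+    if (!(flags & IORING_CQE_F_MORE))
+        break;  /* request terminated; no further CQEs */
+}
+.EE
+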
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. For singleshot accept, the
+non-direct accept returns the installed file descriptor as its value, while the
+direct accept returns
+.B 0
+on success. The caller must know which direct descriptor was picked for this
+request. For multishot accept, the non-direct accept returns the installed
+file descriptor as its value, while the direct accept returns the file index used on
+success. See the related man page for details on possible values for the
+non-direct accept. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR accept4 (2)
diff --git a/man/io_uring_prep_accept_direct.3 b/man/io_uring_prep_accept_direct.3
new file mode 120000
index 0000000..0404bf5
--- /dev/null
+++ b/man/io_uring_prep_accept_direct.3
@@ -0,0 +1 @@
+io_uring_prep_accept.3 \ No newline at end of file
diff --git a/man/io_uring_prep_cancel.3 b/man/io_uring_prep_cancel.3
new file mode 100644
index 0000000..3c9f2df
--- /dev/null
+++ b/man/io_uring_prep_cancel.3
@@ -0,0 +1,118 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_cancel 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_cancel \- prepare a cancelation request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_cancel64(struct io_uring_sqe *" sqe ","
+.BI " __u64 " user_data ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_cancel(struct io_uring_sqe *" sqe ","
+.BI " void *" user_data ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_cancel_fd(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_cancel (3)
+function prepares a cancelation request. The submission queue entry
+.I sqe
+is prepared to cancel an existing request identified by
+.IR user_data .
+For the
+.I flags
+argument, see below.
+
+.BR io_uring_prep_cancel64 (3)
+is identical to
+.BR io_uring_prep_cancel (3),
+except it takes a 64-bit integer rather than a pointer type.
+
+The cancelation request will attempt to find the previously issued request
+identified by
+.I user_data
+and cancel it. The identifier is what the previously issued request has in
+their
+.I user_data
+field in the SQE.
+
+The
+.BR io_uring_prep_cancel_fd (3)
+function prepares a cancelation request. The submission queue entry
+.I sqe
+is prepared to cancel an existing request that used the file descriptor
+.IR fd .
+For the
+.I flags
+argument, see below.
+
+The cancelation request will attempt to find the previously issued request
+that used
+.I fd
+as the file descriptor and cancel it.
+
+By default, the first request matching the criteria given will be canceled.
+This can be modified with any of the following flags passed in:
+.TP
+.B IORING_ASYNC_CANCEL_ALL
+Cancel all requests that match the given criteria, rather than just canceling
+the first one found. Available since 5.19.
+.TP
+.B IORING_ASYNC_CANCEL_FD
+Match based on the file descriptor used in the original request rather than
+the user_data. This is what
+.BR io_uring_prep_cancel_fd (3)
+sets up. Available since 5.19.
+.TP
+.B IORING_ASYNC_CANCEL_ANY
+Match any request in the ring, regardless of user_data or file descriptor.
+Can be used to cancel any pending request in the ring. Available since 5.19.
+.P
+
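+An illustrative sketch (ring setup assumed) canceling every pending request
+that uses a given file descriptor:
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+io_uring_prep_cancel_fd(sqe, fd, IORING_ASYNC_CANCEL_ALL);
+io_uring_submit(ring);
+.EE
+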
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. If no flags are used to cancel multiple requests,
+.B 0
+is returned on success. If flags are used to match multiple requests, then
+a positive value is returned indicating how many requests were found and
+canceled.
+.TP
+.B -ENOENT
+The request identified by
+.I user_data
+could not be located. This could be because it completed before the cancelation
+request was issued, or if an invalid identifier is used.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -EALREADY
+The execution state of the request has progressed far enough that cancelation
+is no longer possible. This should normally mean that it will complete shortly,
+either successfully, or interrupted due to the cancelation.
+.SH NOTES
+Although the cancelation request uses async request syntax, the kernel side of
+the cancelation is always run synchronously. It is guaranteed that a CQE is
+always generated by the time the cancel request has been submitted. If the
+cancelation is successful, the completion for the request targeted for
+cancelation will have been posted by the time submission returns. For
+.B -EALREADY
+it may take a bit of time to do so. For this case, the caller must wait for the
+canceled request to post its completion event.
+.SH SEE ALSO
+.BR io_uring_prep_poll_remove (3),
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_cancel64.3 b/man/io_uring_prep_cancel64.3
new file mode 120000
index 0000000..347db09
--- /dev/null
+++ b/man/io_uring_prep_cancel64.3
@@ -0,0 +1 @@
+io_uring_prep_cancel.3 \ No newline at end of file
diff --git a/man/io_uring_prep_close.3 b/man/io_uring_prep_close.3
new file mode 100644
index 0000000..94780f2
--- /dev/null
+++ b/man/io_uring_prep_close.3
@@ -0,0 +1,59 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_close 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_close \- prepare a file descriptor close request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_close(struct io_uring_sqe *" sqe ","
+.BI " int " fd ");"
+.PP
+.BI "void io_uring_prep_close_direct(struct io_uring_sqe *" sqe ","
+.BI " unsigned " file_index ");"
+.PP
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_close (3)
+function prepares a close request. The submission queue entry
+.I sqe
+is setup to close the file descriptor indicated by
+.IR fd .
+
+For a direct descriptor close request, the offset is specified by the
+.I file_index
+argument instead of the
+.IR fd .
+This is identical to unregistering the direct descriptor, and is provided as
+a convenience.
+
+These functions prepare an async
+.BR close (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR close (2)
diff --git a/man/io_uring_prep_close_direct.3 b/man/io_uring_prep_close_direct.3
new file mode 120000
index 0000000..d9ce6a6
--- /dev/null
+++ b/man/io_uring_prep_close_direct.3
@@ -0,0 +1 @@
+io_uring_prep_close.3 \ No newline at end of file
diff --git a/man/io_uring_prep_connect.3 b/man/io_uring_prep_connect.3
new file mode 100644
index 0000000..6a7c64a
--- /dev/null
+++ b/man/io_uring_prep_connect.3
@@ -0,0 +1,66 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_connect 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_connect \- prepare a connect request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_connect(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " const struct sockaddr *" addr ","
+.BI " socklen_t " addrlen ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_connect (3)
+function prepares a connect request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I sockfd
+to start connecting to the destination described by the socket address at
+.I addr
+and of structure length
+.IR addrlen .
+
+This function prepares an async
+.BR connect (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR connect (2)
diff --git a/man/io_uring_prep_fadvise.3 b/man/io_uring_prep_fadvise.3
new file mode 100644
index 0000000..a53ab25
--- /dev/null
+++ b/man/io_uring_prep_fadvise.3
@@ -0,0 +1,59 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_fadvise 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_fadvise \- prepare a fadvise request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_fadvise(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " __u64 " offset ","
+.BI " off_t " len ","
+.BI " int " advice ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_fadvise (3)
+function prepares an fadvise request. The submission queue entry
+.I sqe
+is setup to use the file descriptor pointed to by
+.I fd
+to start an fadvise operation at
+.I offset
+and of
+.I len
+length in bytes, giving it the advice located in
+.IR advice .
+
+This function prepares an async
+.BR posix_fadvise (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR posix_fadvise (2)
diff --git a/man/io_uring_prep_fallocate.3 b/man/io_uring_prep_fallocate.3
new file mode 100644
index 0000000..86e1d39
--- /dev/null
+++ b/man/io_uring_prep_fallocate.3
@@ -0,0 +1,59 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_fallocate 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_fallocate \- prepare a fallocate request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_fallocate(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " int " mode ","
+.BI " off_t " offset ","
+.BI " off_t " len ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_fallocate (3)
+function prepares a fallocate request. The submission queue entry
+.I sqe
+is setup to use the file descriptor pointed to by
+.I fd
+to start a fallocate operation described by
+.I mode
+at offset
+.I offset
+and
+.I len
+length in bytes.
+
+This function prepares an async
+.BR fallocate (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR fallocate (2)
diff --git a/man/io_uring_prep_files_update.3 b/man/io_uring_prep_files_update.3
new file mode 100644
index 0000000..bedb85e
--- /dev/null
+++ b/man/io_uring_prep_files_update.3
@@ -0,0 +1,92 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_files_update 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_files_update \- prepare a registered file update request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_files_update(struct io_uring_sqe *" sqe ","
+.BI " int *" fds ","
+.BI " unsigned " nr_fds ","
+.BI " int " offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_files_update (3)
+function prepares a request for updating a number of previously registered file
+descriptors. The submission queue entry
+.I sqe
+is setup to use the file descriptor array pointed to by
+.I fds
+and of
+.I nr_fds
+in length to update that amount of previously registered files starting at
+offset
+.IR offset .
+
+When a previously registered file is updated with a new one, the existing
+file is first removed from the table, and the new one is then installed in
+its place. This operation is equivalent to first unregistering that entry
+and then inserting a new one, just bundled into one combined operation.
+
+If
+.I offset
+is specified as IORING_FILE_INDEX_ALLOC, io_uring will allocate free direct
+descriptors instead of having the application pass them in, and will store
+the allocated direct descriptors in the
+.I fds
+array. The CQE
+.I res
+field will return the number of direct descriptors allocated.
+
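+An illustrative sketch updating two registered file slots starting at table
+offset 4 (the descriptors fd0 and fd1 are example values):
+
+.EX
+int fds[2] = { fd0, fd1 };
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+io_uring_prep_files_update(sqe, fds, 2, 4);
+.EE
+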
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.I res
+will contain the number of successfully updated file descriptors. On error,
+the following errors can occur.
+.TP
+.B -ENOMEM
+The kernel was unable to allocate memory for the request.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -EFAULT
+The kernel was unable to copy in the memory pointed to by
+.IR fds .
+.TP
+.B -EBADF
+One of the descriptors located in
+.I fds
+didn't refer to a valid file descriptor, or one of the file descriptors in
+the array referred to an io_uring instance.
+.TP
+.B -EOVERFLOW
+The sum of
+.I offset
+and
+.I nr_fds
+exceeded the valid range or overflowed.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2)
diff --git a/man/io_uring_prep_fsync.3 b/man/io_uring_prep_fsync.3
new file mode 100644
index 0000000..a3259a0
--- /dev/null
+++ b/man/io_uring_prep_fsync.3
@@ -0,0 +1,70 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_fsync 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_fsync \- prepare an fsync request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_fsync(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_fsync (3)
+function prepares an fsync request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+that should get synced, with the modifier flags indicated by the
+.I flags
+argument.
+
+This function prepares an fsync request. By default, it acts like an
+.BR fsync (2)
+operation. If
+.B IORING_FSYNC_DATASYNC
+is set in the
+.I flags
+argument, then it behaves like
+.BR fdatasync (2).
+If no range is specified, the
+.I fd
+will be synced from 0 to end-of-file.
+
+It's possible to specify a range to sync, if one is desired. If the
+.I off
+field of the SQE is set to non-zero, then that indicates the offset to
+start syncing at. If
+.I len
+is set in the SQE, then that indicates the size in bytes to sync from the
+offset. Note that these fields are not accepted by this helper, so they have
+to be set manually in the SQE after calling this prep helper.
+
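+A sketch of a ranged data sync, setting the offset and length manually after
+the prep call, as described above:
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
+sqe->off = 4096;    /* start syncing at offset 4kB */
+sqe->len = 65536;   /* sync 64kB from that offset  */
+.EE
+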
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR fsync (2),
+.BR fdatasync (2)
diff --git a/man/io_uring_prep_link.3 b/man/io_uring_prep_link.3
new file mode 120000
index 0000000..6d3059d
--- /dev/null
+++ b/man/io_uring_prep_link.3
@@ -0,0 +1 @@
+io_uring_prep_linkat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_linkat.3 b/man/io_uring_prep_linkat.3
new file mode 100644
index 0000000..0949e3b
--- /dev/null
+++ b/man/io_uring_prep_linkat.3
@@ -0,0 +1,91 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_linkat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_linkat \- prepare a linkat request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <unistd.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_linkat(struct io_uring_sqe *" sqe ","
+.BI " int " olddirfd ","
+.BI " const char *" oldpath ","
+.BI " int " newdirfd ","
+.BI " const char *" newpath ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_link(struct io_uring_sqe *" sqe ","
+.BI " const char *" oldpath ","
+.BI " const char *" newpath ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_linkat (3)
+function prepares a linkat request. The submission queue entry
+.I sqe
+is setup to use the old directory file descriptor pointed to by
+.I olddirfd
+and old path pointed to by
+.I oldpath
+with the new directory file descriptor pointed to by
+.I newdirfd
+and the new path pointed to by
+.I newpath
+and using the specified flags in
+.IR flags .
+
+The
+.BR io_uring_prep_link (3)
+function prepares a link request. The submission queue entry
+.I sqe
+is setup to use the old path pointed to by
+.I oldpath
+and the new path pointed to by
+.IR newpath ,
+both relative to the current working directory and using the specified flags in
+.IR flags .
+
+These functions prepare an async
+.BR linkat (2)
+or
+.BR link (2)
+request. See those man pages for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR linkat (2),
+.BR link (2)
diff --git a/man/io_uring_prep_madvise.3 b/man/io_uring_prep_madvise.3
new file mode 100644
index 0000000..6c5f16b
--- /dev/null
+++ b/man/io_uring_prep_madvise.3
@@ -0,0 +1,56 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_madvise 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_madvise \- prepare a madvise request
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_madvise(struct io_uring_sqe *" sqe ","
+.BI " void *" addr ","
+.BI " off_t " len ","
+.BI " int " advice ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_madvise (3)
+function prepares an madvise request. The submission queue entry
+.I sqe
+is setup to start an madvise operation at the virtual address of
+.I addr
+and of
+.I len
+length in bytes, giving it the advice located in
+.IR advice .
+
+This function prepares an async
+.BR madvise (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR madvise (2)
diff --git a/man/io_uring_prep_mkdir.3 b/man/io_uring_prep_mkdir.3
new file mode 120000
index 0000000..b3412d1
--- /dev/null
+++ b/man/io_uring_prep_mkdir.3
@@ -0,0 +1 @@
+io_uring_prep_mkdirat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_mkdirat.3 b/man/io_uring_prep_mkdirat.3
new file mode 100644
index 0000000..a98b4e3
--- /dev/null
+++ b/man/io_uring_prep_mkdirat.3
@@ -0,0 +1,83 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_mkdirat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_mkdirat \- prepare an mkdirat request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <sys/stat.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_mkdirat(struct io_uring_sqe *" sqe ","
+.BI " int " dirfd ","
+.BI " const char *" path ","
+.BI " mode_t " mode ");"
+.PP
+.BI "void io_uring_prep_mkdir(struct io_uring_sqe *" sqe ","
+.BI " const char *" path ","
+.BI " mode_t " mode ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_mkdirat (3)
+function prepares a mkdirat request. The submission queue entry
+.I sqe
+is setup to use the directory file descriptor pointed to by
+.I dirfd
+to start a mkdirat operation on the path identified by
+.I path
+with the mode given in
+.IR mode .
+
+The
+.BR io_uring_prep_mkdir (3)
+function prepares a mkdir request. The submission queue entry
+.I sqe
+is setup to use the current working directory to start a mkdir
+operation on the path identified by
+.I path
+with the mode given in
+.IR mode .
+
+These functions prepare an async
+.BR mkdir (2)
+or
+.BR mkdirat (2)
+request. See those man pages for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR mkdirat (2),
+.BR mkdir (2)
diff --git a/man/io_uring_prep_msg_ring.3 b/man/io_uring_prep_msg_ring.3
new file mode 100644
index 0000000..9cf3444
--- /dev/null
+++ b/man/io_uring_prep_msg_ring.3
@@ -0,0 +1,72 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_msg_ring 3 "March 10, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_msg_ring \- send a message to another ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_msg_ring(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " unsigned int " len ","
+.BI " __u64 " data ","
+.BI " unsigned int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_prep_msg_ring (3)
+prepares a request to send a CQE to an io_uring file descriptor. The submission queue
+entry
+.I sqe
+is setup to use the file descriptor
+.IR fd ,
+which must identify an io_uring context, to post a CQE on that ring where the
+target CQE
+.B res
+field will contain the content of
+.I len
+and the
+.B user_data
+of
+.I data
+with the request modifier flags set by
+.IR flags .
+Currently there are no valid flag modifiers; this field must contain
+.BR 0 .
+
+The targeted ring may be any ring that the user has access to, even the ring
+itself. This request can be used for simple message passing to another ring,
+allowing 32+64 bits of data to be transferred through the
+.I len
+and
+.I data
+fields. The use case may be anything from simply waking up someone waiting
+on the targeted ring to passing messages between the two rings.
+
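+A brief sketch (target_ring_fd is an example descriptor for another ring)
+passing a small message:
+
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+
+/* Post a CQE with res=42 and user_data=0xcafe on target_ring_fd. */
+io_uring_prep_msg_ring(sqe, target_ring_fd, 42, 0xcafe, 0);
+.EE
+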
+.SH RETURN VALUE
+None
+
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field.
+.TP
+.B -ENOMEM
+The kernel was unable to allocate memory for the request.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -EBADFD
+The descriptor passed in
+.I fd
+does not refer to an io_uring file descriptor.
+.TP
+.B -EOVERFLOW
+The kernel was unable to fill a CQE on the target ring. This can happen if
+the target CQ ring is in an overflow state and the kernel wasn't able to
+allocate memory for a new CQE entry.
diff --git a/man/io_uring_prep_multishot_accept.3 b/man/io_uring_prep_multishot_accept.3
new file mode 120000
index 0000000..0404bf5
--- /dev/null
+++ b/man/io_uring_prep_multishot_accept.3
@@ -0,0 +1 @@
+io_uring_prep_accept.3 \ No newline at end of file
diff --git a/man/io_uring_prep_multishot_accept_direct.3 b/man/io_uring_prep_multishot_accept_direct.3
new file mode 120000
index 0000000..0404bf5
--- /dev/null
+++ b/man/io_uring_prep_multishot_accept_direct.3
@@ -0,0 +1 @@
+io_uring_prep_accept.3 \ No newline at end of file
diff --git a/man/io_uring_prep_openat.3 b/man/io_uring_prep_openat.3
new file mode 100644
index 0000000..e8b4217
--- /dev/null
+++ b/man/io_uring_prep_openat.3
@@ -0,0 +1,117 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_openat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_openat \- prepare an openat request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/stat.h>
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_openat(struct io_uring_sqe *" sqe ","
+.BI " int " dfd ","
+.BI " const char *" path ","
+.BI " int " flags ","
+.BI " mode_t " mode ");"
+.PP
+.BI "void io_uring_prep_openat_direct(struct io_uring_sqe *" sqe ","
+.BI " int " dfd ","
+.BI " const char *" path ","
+.BI " int " flags ","
+.BI " mode_t " mode ","
+.BI " unsigned " file_index ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_openat (3)
+function prepares an openat request. The submission queue entry
+.I sqe
+is setup to use the directory file descriptor
+.I dfd
+to start opening a file described by
+.I path
+and using the open flags in
+.I flags
+and using the file mode bits specified in
+.IR mode .
+
+For a direct descriptor open request, the offset is specified by the
+.I file_index
+argument. Direct descriptors are io_uring private file descriptors. They
+avoid some of the overhead associated with thread shared file tables, and
+can be used in any io_uring request that takes a file descriptor. To do so,
+.B IOSQE_FIXED_FILE
+must be set in the SQE
+.I flags
+member, and the SQE
+.I fd
+field should use the direct descriptor value rather than the regular file
+descriptor. Direct descriptors are managed like registered files.
+
+If the direct variant is used, the application must first have registered
+a file table using
+.BR io_uring_register_files (3)
+of the appropriate size. Once registered, a direct open request may use any
+entry in that table, as long as it is within the size of the registered table.
+If a specified entry already contains a file, the file will first be removed
+from the table and closed. It's consistent with the behavior of updating an
+existing file with
+.BR io_uring_register_files_update (3).
+Note that old kernels don't check the SQE
+.I file_index
+field, which is not a problem for liburing helpers, but users of the raw
+io_uring interface need to zero SQEs to avoid unexpected behavior.
+
+If
+.B IORING_FILE_INDEX_ALLOC
+is used as the
+.I file_index
+for a direct open, then io_uring will allocate a free direct descriptor in
+the existing table. The allocated descriptor is returned in the CQE
+.I res
+field just like it would be for a non-direct open request. If no more entries
+are available in the direct descriptor table,
+.B -ENFILE
+is returned instead.
+
+These functions prepare an async
+.BR openat (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
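+.SH EXAMPLE
+A minimal sketch of queueing an open of a file relative to the current
+working directory; the helper name and reduced error handling are
+illustrative placeholders.
+.PP
+.nf
+#include <fcntl.h>
+#include <liburing.h>
+
+/* Queue an O_RDONLY open; on success the CQE res holds the new fd */
+static int queue_open(struct io_uring *ring, const char *path)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
+    return io_uring_submit(ring);
+}
+.fi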
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR openat (2)
diff --git a/man/io_uring_prep_openat2.3 b/man/io_uring_prep_openat2.3
new file mode 100644
index 0000000..338cf7e
--- /dev/null
+++ b/man/io_uring_prep_openat2.3
@@ -0,0 +1,117 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_openat2 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_openat2 \- prepare an openat2 request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/stat.h>
+.B #include <fcntl.h>
+.B #include <linux/openat2.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_openat2(struct io_uring_sqe *" sqe ","
+.BI " int " dfd ","
+.BI " const char *" path ","
+.BI " int " flags ","
+.BI " struct open_how *" how ");"
+.PP
+.BI "void io_uring_prep_openat2_direct(struct io_uring_sqe *" sqe ","
+.BI " int " dfd ","
+.BI " const char *" path ","
+.BI " int " flags ","
+.BI " struct open_how *" how ","
+.BI " unsigned " file_index ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_openat2 (3)
+function prepares an openat2 request. The submission queue entry
+.I sqe
+is setup to use the directory file descriptor
+.I dfd
+to start opening a file described by
+.I path
+and using the instructions on how to open the file (including the open
+flags) given in
+.IR how .
+
+For a direct descriptor open request, the offset is specified by the
+.I file_index
+argument. Direct descriptors are io_uring private file descriptors. They
+avoid some of the overhead associated with thread shared file tables, and
+can be used in any io_uring request that takes a file descriptor. To do so,
+.B IOSQE_FIXED_FILE
+must be set in the SQE
+.I flags
+member, and the SQE
+.I fd
+field should use the direct descriptor value rather than the regular file
+descriptor. Direct descriptors are managed like registered files.
+
+If the direct variant is used, the application must first have registered
+a file table using
+.BR io_uring_register_files (3)
+of the appropriate size. Once registered, a direct open request may use any
+entry in that table, as long as it is within the size of the registered table.
+If a specified entry already contains a file, the file will first be removed
+from the table and closed. This is consistent with the behavior of updating an
+existing file with
+.BR io_uring_register_files_update (3).
+Note that old kernels don't check the SQE
+.I file_index
+field, which is not a problem for liburing helpers, but users of the raw
+io_uring interface need to zero SQEs to avoid unexpected behavior.
+If
+.B IORING_FILE_INDEX_ALLOC
+is used as the
+.I file_index
+for a direct open, then io_uring will allocate a free direct descriptor in
+the existing table. The allocated descriptor is returned in the CQE
+.I res
+field just like it would be for a non-direct open request. If no more entries
+are available in the direct descriptor table,
+.B -ENFILE
+is returned instead.
+
+These functions prepare an async
+.BR openat2 (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
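+.SH EXAMPLE
+A minimal sketch of an openat2 request that refuses to follow symlinks;
+the helper name is an illustrative placeholder. Note that
+.I how
+is submitted before it goes out of scope, per the NOTES section above.
+.PP
+.nf
+#include <fcntl.h>
+#include <string.h>
+#include <linux/openat2.h>
+#include <liburing.h>
+
+static int queue_open2(struct io_uring *ring, const char *path)
+{
+    struct io_uring_sqe *sqe;
+    struct open_how how;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    memset(&how, 0, sizeof(how));
+    how.flags = O_RDONLY;
+    how.resolve = RESOLVE_NO_SYMLINKS;
+    io_uring_prep_openat2(sqe, AT_FDCWD, path, &how);
+    /* how must stay valid until submission; submit before returning */
+    return io_uring_submit(ring);
+}
+.fi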
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR openat2 (2)
diff --git a/man/io_uring_prep_openat2_direct.3 b/man/io_uring_prep_openat2_direct.3
new file mode 120000
index 0000000..2c0e6c9
--- /dev/null
+++ b/man/io_uring_prep_openat2_direct.3
@@ -0,0 +1 @@
+io_uring_prep_openat2.3 \ No newline at end of file
diff --git a/man/io_uring_prep_openat_direct.3 b/man/io_uring_prep_openat_direct.3
new file mode 120000
index 0000000..67f501e
--- /dev/null
+++ b/man/io_uring_prep_openat_direct.3
@@ -0,0 +1 @@
+io_uring_prep_openat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_poll_add.3 b/man/io_uring_prep_poll_add.3
new file mode 100644
index 0000000..cb60878
--- /dev/null
+++ b/man/io_uring_prep_poll_add.3
@@ -0,0 +1,72 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_poll_add 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_poll_add \- prepare a poll request
+.SH SYNOPSIS
+.nf
+.B #include <poll.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_poll_add(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " unsigned " poll_mask ");"
+.PP
+.BI "void io_uring_prep_poll_multishot(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " unsigned " poll_mask ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_poll_add (3)
+function prepares a poll request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+that should get polled, with the events desired specified in the
+.I poll_mask
+argument.
+
+The default behavior is a single-shot poll request. When the specified event
+has triggered, a completion CQE is posted and no more events will be generated
+by the poll request.
+.BR io_uring_prep_poll_multishot (3)
+behaves identically in terms of events, but it persists across notifications
+and will repeatedly post completions for the same registration. A CQE
+posted from a multishot poll request will have
+.B IORING_CQE_F_MORE
+set in the CQE
+.I flags
+member, indicating that the application should expect more completions from
+this request. If the multishot poll request gets terminated or experiences
+an error, this flag will not be set in the CQE. If this happens, the application
+should not expect further CQEs from the original request and must reissue a
+new one if it still wishes to get notifications on this file descriptor.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation, which is a bitmask of the
+events notified. See the
+.BR poll (2)
+man page for details. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
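+.SH EXAMPLE
+A minimal sketch of arming a single-shot poll for readability on a socket,
+tagging the request so a later update or cancel request can find it. The
+helper name and the tag value of 42 are illustrative placeholders.
+.PP
+.nf
+#include <poll.h>
+#include <liburing.h>
+
+static int queue_poll_in(struct io_uring *ring, int sockfd)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_poll_add(sqe, sockfd, POLLIN);
+    io_uring_sqe_set_data64(sqe, 42); /* identifier for this request */
+    return io_uring_submit(ring);
+}
+.fi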
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR poll (2),
+.BR epoll_ctl (3)
diff --git a/man/io_uring_prep_poll_multishot.3 b/man/io_uring_prep_poll_multishot.3
new file mode 120000
index 0000000..ac8fb8f
--- /dev/null
+++ b/man/io_uring_prep_poll_multishot.3
@@ -0,0 +1 @@
+io_uring_prep_poll_add.3 \ No newline at end of file
diff --git a/man/io_uring_prep_poll_remove.3 b/man/io_uring_prep_poll_remove.3
new file mode 100644
index 0000000..b6f4b26
--- /dev/null
+++ b/man/io_uring_prep_poll_remove.3
@@ -0,0 +1,55 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_poll_remove 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_poll_remove \- prepare a poll deletion request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_poll_remove(struct io_uring_sqe *" sqe ","
+.BI " __u64 " user_data ");"
+.BI "
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_poll_remove (3)
+function prepares a poll removal request. The submission queue entry
+.I sqe
+is setup to remove a poll request identified by
+.IR user_data .
+
+This request works like
+.BR io_uring_prep_cancel (3),
+except it only matches poll requests. Apart from that, behavior is identical.
+See that man page for specific details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.B 0
+is returned.
+.TP
+.B -ENOENT
+The request identified by
+.I user_data
+could not be located. This could be because it completed before the cancelation
+request was issued, or if an invalid identifier is used.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -EALREADY
+The execution state of the request has progressed far enough that cancelation
+is no longer possible. This should normally mean that it will complete shortly,
+either successfully, or interrupted due to the cancelation.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_prep_cancel (3)
diff --git a/man/io_uring_prep_poll_update.3 b/man/io_uring_prep_poll_update.3
new file mode 100644
index 0000000..11f6346
--- /dev/null
+++ b/man/io_uring_prep_poll_update.3
@@ -0,0 +1,89 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_poll_update 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_poll_update \- update an existing poll request
+.SH SYNOPSIS
+.nf
+.B #include <poll.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_poll_update(struct io_uring_sqe *" sqe ","
+.BI " __u64 " old_user_data ","
+.BI " __u64 " new_user_data ","
+.BI " unsigned " poll_mask ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_poll_update (3)
+function prepares a poll update request. The submission queue entry
+.I sqe
+is setup to update a poll request identified by
+.IR old_user_data ,
+replacing it with the
+.I new_user_data
+information. The
+.I poll_mask
+argument contains the new mask to use for the poll request, and the
+.I flags
+argument contains modifier flags telling io_uring what fields to update.
+
+The
+.I flags
+argument is a bitmask and may contain an OR'ed combination of:
+.TP
+.B IORING_POLL_UPDATE_EVENTS
+If set, the poll update request will replace the existing events being waited
+for with the ones specified in the
+.I poll_mask
+argument to the function.
+.TP
+.B IORING_POLL_UPDATE_USER_DATA
+If set, the poll update request will update the existing user_data of the
+request with the value passed in as the
+.I new_user_data
+argument.
+.TP
+.B IORING_POLL_ADD_MULTI
+If set, this will change the poll request from a single-shot to a multishot
+request. This must be used along with
+.B IORING_POLL_UPDATE_EVENTS
+as the event field must be updated to enable multishot.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.B 0
+is returned.
+.TP
+.B -ENOENT
+The request identified by
+.I old_user_data
+could not be located. This could be because it completed before the cancelation
+request was issued, or if an invalid identifier is used.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -EALREADY
+The execution state of the request has progressed far enough that cancelation
+is no longer possible. This should normally mean that it will complete shortly,
+either successfully, or interrupted due to the cancelation.
+.TP
+.B -ECANCELED
+.B IORING_POLL_UPDATE_EVENTS
+was set and an error occurred re-arming the poll request with the new mask.
+The original poll request is terminated if this happens, and that termination
+CQE will contain the reason for the error re-arming.
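+.SH EXAMPLE
+A minimal sketch of switching an existing poll request, tagged with a
+user_data of 42 by an earlier poll add request, over to waiting for
+writability; the helper name and tag value are illustrative placeholders.
+.PP
+.nf
+#include <poll.h>
+#include <liburing.h>
+
+static int update_poll(struct io_uring *ring)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    /* keep the user_data, replace the event mask with POLLOUT */
+    io_uring_prep_poll_update(sqe, 42, 0, POLLOUT,
+                              IORING_POLL_UPDATE_EVENTS);
+    return io_uring_submit(ring);
+}
+.fi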
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_prep_poll_add (3),
+.BR io_uring_prep_poll_multishot (3)
diff --git a/man/io_uring_prep_provide_buffers.3 b/man/io_uring_prep_provide_buffers.3
new file mode 100644
index 0000000..f3dded9
--- /dev/null
+++ b/man/io_uring_prep_provide_buffers.3
@@ -0,0 +1,131 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_provide_buffers 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_provide_buffers \- prepare a provide buffers request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_provide_buffers(struct io_uring_sqe *" sqe ","
+.BI " void *" addr ","
+.BI " int " len ","
+.BI " int " nr ","
+.BI " int " bgid ","
+.BI " int " bid ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_provide_buffers (3)
+function prepares a request for providing the kernel with buffers. The
+submission queue entry
+.I sqe
+is setup to provide
+.I nr
+buffers, each of
+.I len
+bytes in size, starting at
+.I addr
+and identified by the buffer group ID of
+.I bgid
+and numbered sequentially starting at
+.IR bid .
+
+This function sets up a request to provide buffers to the io_uring context
+that can be used by read or receive operations. This is done by filling in
+the SQE
+.I buf_group
+field and setting
+.B IOSQE_BUFFER_SELECT
+in the SQE
+.I flags
+member. If buffer selection is used for a request, no buffer should be provided
+in the address field. Instead, the group ID is set to match one that was
+previously provided to the kernel. The kernel will then select a buffer from
+this group for the IO operation. On successful completion of the IO request,
+the CQE
+.I flags
+field will have
+.B IORING_CQE_F_BUFFER
+set and the selected buffer ID will be indicated by the upper 16-bits of the
+.I flags
+field.
+
+Different buffer group IDs can be used by the application to have different
+sizes or types of buffers available. Once a buffer has been consumed for an
+operation, it is no longer known to io_uring. It must be re-provided if so
+desired or freed by the application if no longer needed.
+
+The buffer IDs are internally tracked from
+.I bid
+and sequentially ascending from that value. If
+.B 16
+buffers are provided and start with an initial
+.I bid
+of 0, then the buffer IDs will range from
+.BR 0..15 .
+The application must be aware of this to make sense of the buffer ID passed
+back in the CQE.
+
+Not all requests support buffer selection, as it only really makes sense for
+requests that receive data from the kernel rather than write or provide data.
+Currently, this mode of operation is supported for any file read or socket
+receive request. Attempting to use
+.B IOSQE_BUFFER_SELECT
+with a command that doesn't support it will result in a CQE
+.I res
+error of
+.BR -EINVAL .
+Buffer selection will work with operations that take a
+.B struct iovec
+as its data destination, but only if 1 iovec is provided.
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.I res
+will contain the number of successfully provided buffers. On error,
+the following errors can occur.
+.TP
+.B -ENOMEM
+The kernel was unable to allocate memory for the request.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -E2BIG
+The number of buffers provided was too big, or the
+.I bid
+was too big. A max value of
+.B USHRT_MAX
+buffers can be specified.
+.TP
+.B -EFAULT
+Some of the user memory given was invalid for the application.
+.TP
+.B -EBADF
+One of the descriptors located in
+.I fds
+didn't refer to a valid file descriptor, or one of the file descriptors in
+the array referred to an io_uring instance.
+.TP
+.B -EOVERFLOW
+The product of
+.I len
+and
+.I nr
+exceeded the valid amount or overflowed, or the sum of
+.I addr
+and the length of buffers overflowed.
+.TP
+.B -EBUSY
+Attempt to update a slot that is already used.
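+.SH EXAMPLE
+A minimal sketch of providing 16 buffers of 4096 bytes each to buffer
+group 1, with buffer IDs 0 through 15. The backing memory is assumed to
+be allocated by the caller and to stay valid while the kernel owns the
+buffers; the helper name is an illustrative placeholder.
+.PP
+.nf
+#include <liburing.h>
+
+#define NR_BUFS  16
+#define BUF_LEN  4096
+
+/* mem must point to NR_BUFS * BUF_LEN bytes */
+static int provide_bufs(struct io_uring *ring, void *mem)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_provide_buffers(sqe, mem, BUF_LEN, NR_BUFS, 1, 0);
+    return io_uring_submit(ring);
+}
+.fi
+.PP
+A subsequent read or receive request can then set
+.B IOSQE_BUFFER_SELECT
+and a buffer group ID of 1 to have the kernel pick one of these buffers.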
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR io_uring_prep_remove_buffers (3)
diff --git a/man/io_uring_prep_read.3 b/man/io_uring_prep_read.3
new file mode 100644
index 0000000..faec35f
--- /dev/null
+++ b/man/io_uring_prep_read.3
@@ -0,0 +1,69 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_read 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_read \- prepare I/O read request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_read(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " void *" buf ","
+.BI " unsigned " nbytes ","
+.BI " __u64 " offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_read (3)
+prepares an IO read request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start reading
+.I nbytes
+into the buffer
+.I buf
+at the specified
+.IR offset .
+
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the read operation commences at the file offset, and the file offset is
+incremented by the number of bytes read. See
+.BR read (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the read has been prepared it can be submitted with one of the submit
+functions.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
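+.SH EXAMPLE
+A minimal sketch of reading the first 4096 bytes of a file into a
+caller-provided buffer, which must stay valid until the request
+completes; the helper name is an illustrative placeholder.
+.PP
+.nf
+#include <liburing.h>
+
+static int queue_read(struct io_uring *ring, int fd, void *buf)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    /* explicit offset 0 rather than the current file position */
+    io_uring_prep_read(sqe, fd, buf, 4096, 0);
+    return io_uring_submit(ring);
+}
+.fi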
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_prep_readv (3),
+.BR io_uring_prep_readv2 (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_read_fixed.3 b/man/io_uring_prep_read_fixed.3
new file mode 100644
index 0000000..d3726f2
--- /dev/null
+++ b/man/io_uring_prep_read_fixed.3
@@ -0,0 +1,72 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_read_fixed 3 "February 13, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_read_fixed \- prepare I/O read request with registered buffer
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_read_fixed(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " void *" buf ","
+.BI " unsigned " nbytes ","
+.BI " __u64 " offset ","
+.BI " int " buf_index ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_read_fixed (3)
+prepares an IO read request with a previously registered IO buffer. The
+submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start reading
+.I nbytes
+into the buffer
+.I buf
+at the specified
+.IR offset ,
+and with the buffer matching the registered index of
+.IR buf_index .
+
+This works just like
+.BR io_uring_prep_read (3)
+except it requires the use of buffers that have been registered with
+.BR io_uring_register_buffers (3).
+The
+.I buf
+and
+.I nbytes
+arguments must fall within a region specified by
+.I buf_index
+in the previously registered buffer. The buffer need not be aligned with
+the start of the registered buffer.
+
+After the read has been prepared it can be submitted with one of the submit
+functions.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
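+.SH EXAMPLE
+A minimal sketch that assumes
+.I iov
+was previously registered as buffer index 0 with
+.BR io_uring_register_buffers (3)
+and describes a region of at least 4096 bytes; the helper name is an
+illustrative placeholder.
+.PP
+.nf
+#include <sys/uio.h>
+#include <liburing.h>
+
+static int queue_read_fixed(struct io_uring *ring, int fd,
+                            struct iovec *iov)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    /* read into the start of registered buffer index 0 */
+    io_uring_prep_read_fixed(sqe, fd, iov->iov_base, 4096, 0, 0);
+    return io_uring_submit(ring);
+}
+.fi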
+.SH SEE ALSO
+.BR io_uring_prep_read (3),
+.BR io_uring_register_buffers (3)
diff --git a/man/io_uring_prep_readv.3 b/man/io_uring_prep_readv.3
new file mode 100644
index 0000000..ea7afd5
--- /dev/null
+++ b/man/io_uring_prep_readv.3
@@ -0,0 +1,85 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_readv 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_readv \- prepare vector I/O read request
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_readv(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const struct iovec *" iovecs ","
+.BI " unsigned " nr_vecs ","
+.BI " __u64 " offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_readv (3)
+prepares a vectored IO read request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start reading
+.I nr_vecs
+vectors into the
+.I iovecs
+array at the specified
+.IR offset .
+
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the read operation commences at the file offset, and the file offset is
+incremented by the number of bytes read. See
+.BR read (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the read has been prepared it can be submitted with one of the submit
+functions.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+Unless an application explicitly needs to pass in more than one iovec, it is
+more
+efficient to use
+.BR io_uring_prep_read (3)
+rather than this function, as no state has to be maintained for a
+non-vectored IO request.
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
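+.SH EXAMPLE
+A minimal sketch of a two-vector read from the start of a file. Per the
+NOTES section above, the iovec array only needs to stay valid until
+submission, while the buffers themselves must stay valid until
+completion; the helper name is an illustrative placeholder.
+.PP
+.nf
+#include <sys/uio.h>
+#include <liburing.h>
+
+static int queue_readv(struct io_uring *ring, int fd,
+                       void *hdr, size_t hdr_len,
+                       void *body, size_t body_len)
+{
+    struct iovec iov[2] = {
+        { .iov_base = hdr,  .iov_len = hdr_len },
+        { .iov_base = body, .iov_len = body_len },
+    };
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_readv(sqe, fd, iov, 2, 0);
+    /* iov is submitted before it goes out of scope */
+    return io_uring_submit(ring);
+}
+.fi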
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_prep_read (3),
+.BR io_uring_prep_readv2 (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_readv2.3 b/man/io_uring_prep_readv2.3
new file mode 100644
index 0000000..171a699
--- /dev/null
+++ b/man/io_uring_prep_readv2.3
@@ -0,0 +1,111 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_readv2 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_readv2 \- prepare vector I/O read request with flags
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_readv2(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const struct iovec *" iovecs ","
+.BI " unsigned " nr_vecs ","
+.BI " __u64 " offset ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_readv2 (3)
+prepares a vectored IO read request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start reading
+.I nr_vecs
+vectors into the
+.I iovecs
+array at the specified
+.IR offset .
+The behavior of the function can be controlled with the
+.I flags
+parameter.
+
+Supported values for
+.I flags
+are:
+.TP
+.B RWF_HIPRI
+High priority request, poll if possible
+.TP
+.B RWF_DSYNC
+per-IO O_DSYNC
+.TP
+.B RWF_SYNC
+per-IO O_SYNC
+.TP
+.B RWF_NOWAIT
+per-IO, return
+.B -EAGAIN
+if the operation would block
+.TP
+.B RWF_APPEND
+per-IO O_APPEND
+
+.P
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the read operation commences at the file offset, and the file offset is
+incremented by the number of bytes read. See
+.BR read (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the read has been prepared, it can be submitted with one of the submit
+functions.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+Unless an application explicitly needs to pass in more than one iovec, it is
+more
+efficient to use
+.BR io_uring_prep_read (3)
+rather than this function, as no state has to be maintained for a
+non-vectored IO request.
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_prep_read (3),
+.BR io_uring_prep_readv (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_recv.3 b/man/io_uring_prep_recv.3
new file mode 100644
index 0000000..993e331
--- /dev/null
+++ b/man/io_uring_prep_recv.3
@@ -0,0 +1,83 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_recv 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_recv \- prepare a recv request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_recv(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " void *" buf ","
+.BI " size_t " len ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_recv (3)
+function prepares a recv request. The submission
+queue entry
+.I sqe
+is setup to use the file descriptor
+.I sockfd
+to start receiving the data into the buffer destination
+.I buf
+of size
+.I len
+and with modifier flags
+.IR flags .
+
+This function prepares an async
+.BR recv (2)
+request. See that man page for details on the arguments specified to this
+prep helper.
+
+After calling this function, additional io_uring internal modifier flags
+may be set in the SQE
+.I ioprio
+field. The following flags are supported:
+.TP
+.B IORING_RECVSEND_POLL_FIRST
+If set, io_uring will assume the socket is currently empty and attempting to
+receive data will be unsuccessful. For this case, io_uring will arm internal
+poll and trigger a receive of the data when the socket has data to be read.
+This initial receive attempt can be wasteful for the case where the socket
+is expected to be empty. Setting this flag will bypass the initial receive
+attempt and go straight to arming poll. If poll does indicate that data is
+ready to be received, the operation will proceed.
+
+This flag can be used with the CQE
+.B IORING_CQE_F_SOCK_NONEMPTY
+flag, which io_uring will set on CQEs after a
+.BR recv (2)
+or
+.BR recvmsg (2)
+operation. If set, the socket still had data to be read after the operation
+completed. Both these flags are available since kernel 5.19.
+.P
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
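+.SH EXAMPLE
+A minimal sketch of queueing a receive into a caller-provided buffer,
+arming poll first as described above. The
+.I ioprio
+assignment assumes a 5.19 or newer kernel, and the helper name is an
+illustrative placeholder.
+.PP
+.nf
+#include <liburing.h>
+
+static int queue_recv(struct io_uring *ring, int sockfd,
+                      void *buf, size_t len)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_recv(sqe, sockfd, buf, len, 0);
+    /* skip the initial receive attempt, arm poll directly */
+    sqe->ioprio |= IORING_RECVSEND_POLL_FIRST;
+    return io_uring_submit(ring);
+}
+.fi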
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR recv (2)
diff --git a/man/io_uring_prep_recvmsg.3 b/man/io_uring_prep_recvmsg.3
new file mode 100644
index 0000000..8c49411
--- /dev/null
+++ b/man/io_uring_prep_recvmsg.3
@@ -0,0 +1,94 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_recvmsg 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_recvmsg \- prepare a recvmsg request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_recvmsg(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " struct msghdr *" msg ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_recvmsg (3)
+function prepares a recvmsg request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start receiving the data indicated by
+.I msg
+with the
+.BR recvmsg (2)
+defined flags in the
+.I flags
+argument.
+
+This function prepares an async
+.BR recvmsg (2)
+request. See that man page for details on the arguments specified to this
+prep helper.
+
+After calling this function, additional io_uring internal modifier flags
+may be set in the SQE
+.I ioprio
+field. The following flags are supported:
+.TP
+.B IORING_RECVSEND_POLL_FIRST
+If set, io_uring will assume the socket is currently empty and attempting to
+receive data will be unsuccessful. For this case, io_uring will arm internal
+poll and trigger a receive of the data when the socket has data to be read.
+This initial receive attempt can be wasteful for the case where the socket
+is expected to be empty. Setting this flag will bypass the initial receive
+attempt and go straight to arming poll. If poll does indicate that data is
+ready to be received, the operation will proceed.
+
+This flag can be used with the CQE
+.B IORING_CQE_F_SOCK_NONEMPTY
+flag, which io_uring will set on CQEs after a
+.BR recv (2)
+or
+.BR recvmsg (2)
+operation. If set, the socket still had data to be read after the operation
+completed. Both these flags are available since kernel 5.19.
+.P
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR recvmsg (2)
diff --git a/man/io_uring_prep_remove_buffers.3 b/man/io_uring_prep_remove_buffers.3
new file mode 100644
index 0000000..cf4f226
--- /dev/null
+++ b/man/io_uring_prep_remove_buffers.3
@@ -0,0 +1,52 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_remove_buffers 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_remove_buffers \- prepare a remove buffers request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_remove_buffers(struct io_uring_sqe *" sqe ","
+.BI " int " nr ","
+.BI " int " bgid ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_remove_buffers (3)
+function prepares a request for removing previously supplied buffers. The
+submission queue entry
+.I sqe
+is setup to remove
+.I nr
+number of buffers from the buffer group ID indicated by
+.IR bgid .
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.I res
+will contain the number of successfully removed buffers. On error,
+the following errors can occur.
+.TP
+.B -ENOMEM
+The kernel was unable to allocate memory for the request.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid.
+.TP
+.B -ENOENT
+No buffers exist at the specified
+.I bgid
+buffer group ID.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR io_uring_prep_provide_buffers (3)
diff --git a/man/io_uring_prep_rename.3 b/man/io_uring_prep_rename.3
new file mode 120000
index 0000000..785b55e
--- /dev/null
+++ b/man/io_uring_prep_rename.3
@@ -0,0 +1 @@
+io_uring_prep_renameat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_renameat.3 b/man/io_uring_prep_renameat.3
new file mode 100644
index 0000000..1fc9e01
--- /dev/null
+++ b/man/io_uring_prep_renameat.3
@@ -0,0 +1,96 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_renameat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_renameat \- prepare a renameat request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <stdio.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_renameat(struct io_uring_sqe *" sqe ","
+.BI " int " olddirfd ","
+.BI " const char *" oldpath ","
+.BI " int " newdirfd ","
+.BI " const char *" newpath ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_rename(struct io_uring_sqe *" sqe ","
+.BI " const char *" oldpath ","
+.BI " const char *" newpath ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_renameat (3)
+function prepares a renameat request. The submission queue entry
+.I sqe
+is setup to use the old directory file descriptor pointed to by
+.I olddirfd
+and old path pointed to by
+.I oldpath
+with the new directory file descriptor pointed to by
+.I newdirfd
+and the new path pointed to by
+.I newpath
+and using the specified flags in
+.IR flags .
+
+The
+.BR io_uring_prep_rename (3)
+function prepares a rename request. The submission queue entry
+.I sqe
+is setup to use the old path pointed to by
+.I oldpath
+with the new path pointed to by
+.IR newpath ,
+both relative to the current working directory and using the specified flags in
+.IR flags .
+
+These functions prepare an async
+.BR renameat2 (2)
+or
+.BR rename (2)
+request. If
+.I flags
+is zero, then this call is similar to the
+.BR renameat (2)
+system call. See those man pages for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR renameat (2),
+.BR renameat2 (2),
+.BR rename (2)
diff --git a/man/io_uring_prep_send.3 b/man/io_uring_prep_send.3
new file mode 100644
index 0000000..10c86ba
--- /dev/null
+++ b/man/io_uring_prep_send.3
@@ -0,0 +1,57 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_send 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_send \- prepare a send request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_send(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " const void *" buf ","
+.BI " size_t " len ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_send (3)
+function prepares a send request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I sockfd
+to start sending the data from
+.I buf
+of size
+.I len
+and with modifier flags
+.IR flags .
+
+This function prepares an async
+.BR send (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
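+.SH EXAMPLE
+A minimal sketch of queueing a send of a caller-provided buffer, which
+must stay valid until the request completes; the helper name is an
+illustrative placeholder.
+.PP
+.nf
+#include <liburing.h>
+
+static int queue_send(struct io_uring *ring, int sockfd,
+                      const void *buf, size_t len)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_send(sqe, sockfd, buf, len, 0);
+    return io_uring_submit(ring);
+}
+.fi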
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR send (2)
diff --git a/man/io_uring_prep_sendmsg.3 b/man/io_uring_prep_sendmsg.3
new file mode 100644
index 0000000..bc81d91
--- /dev/null
+++ b/man/io_uring_prep_sendmsg.3
@@ -0,0 +1,69 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_sendmsg 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_sendmsg \- prepare a sendmsg request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_sendmsg(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const struct msghdr *" msg ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_sendmsg (3)
+function prepares a sendmsg request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start sending the data indicated by
+.I msg
+with the
+.BR sendmsg (2)
+defined flags in the
+.I flags
+argument.
+
+This function prepares an async
+.BR sendmsg (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
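+.SH EXAMPLE
+A minimal sketch of sending a single-vector message. Per the NOTES
+section above, the msghdr and iovec only need to stay valid until
+submission, while the buffer itself must stay valid until completion;
+the helper name is an illustrative placeholder.
+.PP
+.nf
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <liburing.h>
+
+static int queue_sendmsg(struct io_uring *ring, int fd,
+                         void *buf, size_t len)
+{
+    struct iovec iov = { .iov_base = buf, .iov_len = len };
+    struct msghdr msg;
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    io_uring_prep_sendmsg(sqe, fd, &msg, 0);
+    /* msg and iov are submitted before they go out of scope */
+    return io_uring_submit(ring);
+}
+.fi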
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR sendmsg (2)
diff --git a/man/io_uring_prep_shutdown.3 b/man/io_uring_prep_shutdown.3
new file mode 100644
index 0000000..9125e95
--- /dev/null
+++ b/man/io_uring_prep_shutdown.3
@@ -0,0 +1,53 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_shutdown 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_shutdown \- prepare a shutdown request
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_shutdown(struct io_uring_sqe *" sqe ","
+.BI " int " sockfd ","
+.BI " int " how ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_shutdown (3)
+function prepares a shutdown request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I sockfd
+that should be shut down with the
+.I how
+argument.
+
+This function prepares an async
+.BR shutdown (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR shutdown (2)
diff --git a/man/io_uring_prep_socket.3 b/man/io_uring_prep_socket.3
new file mode 100644
index 0000000..473f225
--- /dev/null
+++ b/man/io_uring_prep_socket.3
@@ -0,0 +1,97 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_socket 3 "May 27, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_socket \- prepare a socket creation request
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_socket(struct io_uring_sqe *" sqe ","
+.BI " int " domain ","
+.BI " int " type ","
+.BI " int " protocol ","
+.BI " unsigned int " flags ");"
+.PP
+.BI "void io_uring_prep_socket_direct(struct io_uring_sqe *" sqe ","
+.BI " int " domain ","
+.BI " int " type ","
+.BI " int " protocol ","
+.BI " unsigned int " file_index ","
+.BI " unsigned int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_socket (3)
+function prepares a socket creation request. The submission queue entry
+.I sqe
+is setup to use the communication domain defined by
+.I domain
+and use the communication type defined by
+.I type
+and the protocol set by
+.IR protocol .
+The
+.I flags
+argument is currently unused.
+
+The
+.BR io_uring_prep_socket_direct (3)
+function works just like
+.BR io_uring_prep_socket (3),
+except it maps the socket to a direct descriptor rather than returning a normal
+file descriptor. The
+.I file_index
+argument should be set to the slot that should be used for this socket, or
+.B IORING_FILE_INDEX_ALLOC
+if io_uring should allocate a free one.
+
+If the direct variant is used, the application must first have registered
+a file table using
+.BR io_uring_register_files (3)
+of the appropriate size. Once registered, a direct socket request may use any
+entry in that table, as long as it is within the size of the registered table.
+If a specified entry already contains a file, the file will first be removed
+from the table and closed. This is consistent with the behavior of updating an
+existing file with
+.BR io_uring_register_files_update (3).
+
+For a direct descriptor socket request, the
+.I file_index
+argument can be set to
+.BR IORING_FILE_INDEX_ALLOC .
+In this case, a free entry in the io_uring file table will
+be used automatically and the file index will be returned as the CQE
+.IR res .
+.B -ENFILE
+is returned instead if there are no free entries in the io_uring file table.
+
+These functions prepare an async
+.BR socket (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
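+.SH EXAMPLE
+A minimal sketch of creating a TCP socket; on success the CQE
+.I res
+field will hold the new file descriptor. The helper name is an
+illustrative placeholder.
+.PP
+.nf
+#include <sys/socket.h>
+#include <liburing.h>
+
+static int queue_socket(struct io_uring *ring)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_socket(sqe, AF_INET, SOCK_STREAM, 0, 0);
+    return io_uring_submit(ring);
+}
+.fi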
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR socket (2)
diff --git a/man/io_uring_prep_socket_direct.3 b/man/io_uring_prep_socket_direct.3
new file mode 120000
index 0000000..15d7b7f
--- /dev/null
+++ b/man/io_uring_prep_socket_direct.3
@@ -0,0 +1 @@
+io_uring_prep_socket.3 \ No newline at end of file
diff --git a/man/io_uring_prep_splice.3 b/man/io_uring_prep_splice.3
new file mode 100644
index 0000000..cb82ad0
--- /dev/null
+++ b/man/io_uring_prep_splice.3
@@ -0,0 +1,80 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_splice 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_splice \- prepare a splice request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_splice(struct io_uring_sqe *" sqe ","
+.BI " int " fd_in ","
+.BI " int64_t " off_in ","
+.BI " int " fd_out ","
+.BI " int64_t " off_out ","
+.BI " unsigned int " nbytes ","
+.BI " unsigned int " splice_flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_splice (3)
+function prepares a splice request. The submission queue entry
+.I sqe
+is setup to use as input the file descriptor
+.I fd_in
+at offset
+.IR off_in ,
+splicing data to the file descriptor at
+.I fd_out
+and at offset
+.IR off_out .
+.I nbytes
+bytes of data should be spliced between the two descriptors.
+.I splice_flags
+are modifier flags for the operation. See
+.BR splice (2)
+for the generic splice flags.
+
+If the
+.I fd_out
+descriptor is a registered file descriptor,
+.B IOSQE_FIXED_FILE
+can be set in the SQE to indicate that. For the input file, the io_uring
+specific
+.B SPLICE_F_FD_IN_FIXED
+can be set in
+.I splice_flags
+and
+.I fd_in
+given as a registered file descriptor offset.
+
+This function prepares an async
+.BR splice (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
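+.SH EXAMPLE
+A minimal sketch of splicing up to 4096 bytes from the read end of a
+pipe to the start of a regular file; as with
+.BR splice (2),
+the pipe side must pass an offset of -1. The helper name is an
+illustrative placeholder.
+.PP
+.nf
+#include <liburing.h>
+
+static int queue_splice(struct io_uring *ring, int pipe_rd, int filefd)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    /* pipe input: offset must be -1; file output: write at offset 0 */
+    io_uring_prep_splice(sqe, pipe_rd, -1, filefd, 0, 4096, 0);
+    return io_uring_submit(ring);
+}
+.fi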
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR splice (2)
diff --git a/man/io_uring_prep_statx.3 b/man/io_uring_prep_statx.3
new file mode 100644
index 0000000..d9d983a
--- /dev/null
+++ b/man/io_uring_prep_statx.3
@@ -0,0 +1,74 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_statx 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_statx \- prepare a statx request
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <sys/stat.h>
+.B #include <unistd.h>
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_statx(struct io_uring_sqe *" sqe ","
+.BI " int " dirfd ","
+.BI " const char *" path ","
+.BI " int " flags ","
+.BI " unsigned " mask ","
+.BI " struct statx *" statxbuf ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_statx (3)
+function prepares a statx request. The submission queue entry
+.I sqe
+is setup to use the directory file descriptor pointed to by
+.I dirfd
+to start a statx operation on the path identified by
+.I path
+and using the flags given in
+.I flags
+for the fields specified by
+.I mask
+and into the buffer located at
+.IR statxbuf .
+
+This function prepares an async
+.BR statx (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
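+.SH EXAMPLE
+A minimal sketch of retrieving the size of a file relative to the
+current working directory. The statx output buffer must stay valid
+until the request completes; the helper name is an illustrative
+placeholder.
+.PP
+.nf
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <liburing.h>
+
+static int queue_statx(struct io_uring *ring, const char *path,
+                       struct statx *stx)
+{
+    struct io_uring_sqe *sqe;
+
+    sqe = io_uring_get_sqe(ring);
+    if (!sqe)
+        return -1;
+    io_uring_prep_statx(sqe, AT_FDCWD, path, 0, STATX_SIZE, stx);
+    return io_uring_submit(ring);
+}
+.fi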
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR statx (2)
diff --git a/man/io_uring_prep_symlink.3 b/man/io_uring_prep_symlink.3
new file mode 120000
index 0000000..ae6f41a
--- /dev/null
+++ b/man/io_uring_prep_symlink.3
@@ -0,0 +1 @@
+io_uring_prep_symlinkat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_symlinkat.3 b/man/io_uring_prep_symlinkat.3
new file mode 100644
index 0000000..0fa7056
--- /dev/null
+++ b/man/io_uring_prep_symlinkat.3
@@ -0,0 +1,85 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_symlinkat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_symlinkat \- prepare a symlinkat request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <unistd.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_symlinkat(struct io_uring_sqe *" sqe ","
+.BI " const char *" target ","
+.BI " int " newdirfd ","
+.BI " const char *" linkpath ");"
+.PP
+.BI "void io_uring_prep_symlink(struct io_uring_sqe *" sqe ","
+.BI " const char *" target ","
+.BI " const char *" linkpath ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_symlinkat (3)
+function prepares a symlinkat request. The submission queue entry
+.I sqe
+is setup to symlink the target path pointed to by
+.I target
+to the new destination indicated by
+.I newdirfd
+and
+.IR linkpath .
+
+The
+.BR io_uring_prep_symlink (3)
+function prepares a symlink request. The submission queue entry
+.I sqe
+is setup to symlink the target path pointed to by
+.I target
+to the new destination indicated by
+.I linkpath
+relative to the current working directory.
+
+These functions prepare an async
+.BR symlinkat (2)
+or
+.BR symlink (2)
+request. See those man pages for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR symlinkat (2),
+.BR symlink (2)
diff --git a/man/io_uring_prep_sync_file_range.3 b/man/io_uring_prep_sync_file_range.3
new file mode 100644
index 0000000..830e411
--- /dev/null
+++ b/man/io_uring_prep_sync_file_range.3
@@ -0,0 +1,59 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_sync_file_range 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_sync_file_range \- prepare a sync_file_range request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_sync_file_range(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " unsigned " len ","
+.BI " __u64 " offset ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_sync_file_range (3)
+function prepares a sync_file_range request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+that should get
+.I len
+bytes synced, starting at offset
+.I offset
+and with modifier flags in the
+.I flags
+argument.
+
+This function prepares an async
+.BR sync_file_range (2)
+request. See that man page for details on the arguments.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR sync_file_range (2)
diff --git a/man/io_uring_prep_tee.3 b/man/io_uring_prep_tee.3
new file mode 100644
index 0000000..44aaaf6
--- /dev/null
+++ b/man/io_uring_prep_tee.3
@@ -0,0 +1,74 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_tee 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_tee \- prepare a tee request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_tee(struct io_uring_sqe *" sqe ","
+.BI " int " fd_in ","
+.BI " int " fd_out ","
+.BI " unsigned int " nbytes ","
+.BI " unsigned int " splice_flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_tee (3)
+function prepares a tee request. The submission queue entry
+.I sqe
+is setup to use as input the file descriptor
+.I fd_in
+and as output the file descriptor
+.I fd_out
+duplicating
+.I nbytes
+bytes worth of data.
+.I splice_flags
+are modifier flags for the operation. See
+.BR tee (2)
+for the generic splice flags.
+
+If the
+.I fd_out
+descriptor is a registered file descriptor,
+.B IOSQE_FIXED_FILE
+can be set in the SQE to indicate that. For the input file, the io_uring
+specific
+.B SPLICE_F_FD_IN_FIXED
+can be set and
+.I fd_in
+given as a registered file descriptor offset.
+
+This function prepares an async
+.BR tee (2)
+request. See that man page for details.
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR splice (2),
+.BR tee (2)
diff --git a/man/io_uring_prep_timeout.3 b/man/io_uring_prep_timeout.3
new file mode 100644
index 0000000..bfb8791
--- /dev/null
+++ b/man/io_uring_prep_timeout.3
@@ -0,0 +1,95 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_timeout 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_timeout \- prepare a timeout request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_timeout(struct io_uring_sqe *" sqe ","
+.BI " struct __kernel_timespec *" ts ","
+.BI " unsigned " count ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_timeout (3)
+function prepares a timeout request. The submission queue entry
+.I sqe
+is setup to arm a timeout specified by
+.I ts
+and with a timeout count of
+.I count
+completion entries. The
+.I flags
+argument holds modifier flags for the request.
+
+This request type can be used as a timeout waking anyone sleeping
+for events on the CQ ring. The
+.I flags
+argument may contain:
+.TP
+.B IORING_TIMEOUT_ABS
+The value specified in
+.I ts
+is an absolute value rather than a relative one.
+.TP
+.B IORING_TIMEOUT_BOOTTIME
+The boottime clock source should be used.
+.TP
+.B IORING_TIMEOUT_REALTIME
+The realtime clock source should be used.
+.TP
+.B IORING_TIMEOUT_ETIME_SUCCESS
+Consider an expired timeout a success in terms of the posted completion.
+Normally a timeout that triggers would post a CQE
+.I res
+value of
+.BR -ETIME .
+.PP
+The timeout completion event will trigger if either the specified timeout
+has occurred, or the specified number of events to wait for have been posted
+to the CQ ring.
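+.PP
+For example, the following sketch (initialized ring assumed, error handling
+omitted) arms a timeout that fires after one second, or as soon as one
+completion has been posted, whichever happens first:
+.PP
+.in +4n
+.EX
+struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* relative 1 second timeout, or wake after 1 CQE is posted */
+io_uring_prep_timeout(sqe, &ts, 1, 0);
+io_uring_submit(&ring);
+.EE
+.in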
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.B 0
+is returned.
+.TP
+.B -ETIME
+The specified timeout occurred and triggered the completion event.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid. For example, two clocksources
+were given, or the specified timeout seconds or nanoseconds were < 0.
+.TP
+.B -EFAULT
+io_uring was unable to access the data specified by
+.IR ts .
+.TP
+.B -ECANCELED
+The timeout was canceled by a removal request.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_prep_timeout_remove (3),
+.BR io_uring_prep_timeout_update (3)
diff --git a/man/io_uring_prep_timeout_remove.3 b/man/io_uring_prep_timeout_remove.3
new file mode 120000
index 0000000..5aebd36
--- /dev/null
+++ b/man/io_uring_prep_timeout_remove.3
@@ -0,0 +1 @@
+io_uring_prep_timeout_update.3 \ No newline at end of file
diff --git a/man/io_uring_prep_timeout_update.3 b/man/io_uring_prep_timeout_update.3
new file mode 100644
index 0000000..cb9ed12
--- /dev/null
+++ b/man/io_uring_prep_timeout_update.3
@@ -0,0 +1,98 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_timeout_update 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_timeout_update \- prepare a request to update an existing timeout
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_timeout_update(struct io_uring_sqe *" sqe ","
+.BI " struct __kernel_timespec *" ts ","
+.BI " __u64 " user_data ","
+.BI " unsigned " flags ");"
+.PP
+.BI "void io_uring_prep_timeout_remove(struct io_uring_sqe *" sqe ","
+.BI " __u64 " user_data ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+These functions modify or cancel an existing timeout request. The submission
+queue entry
+.I sqe
+is setup to arm a timeout update or removal specified by
+.I user_data
+and with modifier flags given by
+.IR flags .
+Additionally, the update request includes a
+.I ts
+structure, which contains new timeout information.
+
+For an update request, the
+.I flags
+member may contain a bitmask of the following values:
+.TP
+.B IORING_TIMEOUT_ABS
+The value specified in
+.I ts
+is an absolute value rather than a relative one.
+.TP
+.B IORING_TIMEOUT_BOOTTIME
+The boottime clock source should be used.
+.TP
+.B IORING_TIMEOUT_REALTIME
+The realtime clock source should be used.
+.TP
+.B IORING_TIMEOUT_ETIME_SUCCESS
+Consider an expired timeout a success in terms of the posted completion.
+Normally a timeout that triggers would post a CQE
+.I res
+value of
+.BR -ETIME .
+.PP
+
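+As a brief sketch, assuming a timeout request was earlier submitted with its
+.I user_data
+field set to 0x1234, the following updates it to a new two-second expiry
+(initialized ring assumed, error handling omitted):
+.PP
+.in +4n
+.EX
+struct __kernel_timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* update the timeout identified by user_data 0x1234 */
+io_uring_prep_timeout_update(sqe, &ts, 0x1234, 0);
+io_uring_submit(&ring);
+.EE
+.in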
+.SH RETURN VALUE
+None
+.SH ERRORS
+These are the errors that are reported in the CQE
+.I res
+field. On success,
+.B 0
+is returned.
+.TP
+.B -ENOENT
+The timeout identified by
+.I user_data
+could not be found. It may be invalid, or triggered before the update or
+removal request was processed.
+.TP
+.B -EALREADY
+The timeout identified by
+.I user_data
+is already firing and cannot be canceled.
+.TP
+.B -EINVAL
+One of the fields set in the SQE was invalid. For example, two clocksources
+were given, or the specified timeout seconds or nanoseconds were < 0.
+.TP
+.B -EFAULT
+io_uring was unable to access the data specified by
+.IR ts .
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_prep_timeout (3)
diff --git a/man/io_uring_prep_unlink.3 b/man/io_uring_prep_unlink.3
new file mode 120000
index 0000000..80f86d2
--- /dev/null
+++ b/man/io_uring_prep_unlink.3
@@ -0,0 +1 @@
+io_uring_prep_unlinkat.3 \ No newline at end of file
diff --git a/man/io_uring_prep_unlinkat.3 b/man/io_uring_prep_unlinkat.3
new file mode 100644
index 0000000..ba2633c
--- /dev/null
+++ b/man/io_uring_prep_unlinkat.3
@@ -0,0 +1,82 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_unlinkat 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_prep_unlinkat \- prepare an unlinkat request
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.B #include <unistd.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_unlinkat(struct io_uring_sqe *" sqe ","
+.BI " int " dirfd ","
+.BI " const char *" path ","
+.BI " int " flags ");"
+.PP
+.BI "void io_uring_prep_unlink(struct io_uring_sqe *" sqe ","
+.BI " const char *" path ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_unlinkat (3)
+function prepares an unlinkat request. The submission queue entry
+.I sqe
+is setup to use the directory file descriptor pointed to by
+.I dirfd
+to start an unlinkat operation on the path identified by
+.I path
+and using the flags given in
+.IR flags .
+
+The
+.BR io_uring_prep_unlink (3)
+function prepares an unlink request. The submission queue entry
+.I sqe
+is setup to start an unlinkat operation on the path identified by
+.I path
+relative to the current working directory and using the flags given in
+.IR flags .
+
+These functions prepare an async
+.BR unlinkat (2)
+or
+.BR unlink (2)
+request. See those man pages for details.
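+.PP
+As an example, the following sketch removes a file relative to an open
+directory descriptor
+.I dirfd
+(initialized ring assumed, error handling omitted):
+.PP
+.in +4n
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* remove "somefile" relative to dirfd; flags 0 means unlink,
+   not AT_REMOVEDIR */
+io_uring_prep_unlinkat(sqe, dirfd, "somefile", 0);
+io_uring_submit(&ring);
+.EE
+.in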
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR unlinkat (2),
+.BR unlink (2)
diff --git a/man/io_uring_prep_write.3 b/man/io_uring_prep_write.3
new file mode 100644
index 0000000..791a5f1
--- /dev/null
+++ b/man/io_uring_prep_write.3
@@ -0,0 +1,67 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_write 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_write \- prepare I/O write request
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_write(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const void *" buf ","
+.BI " unsigned " nbytes ","
+.BI " __u64 " offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_write (3)
+prepares an IO write request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start writing
+.I nbytes
+from the buffer
+.I buf
+at the specified
+.IR offset .
+
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the write operation commences at the file offset, and the file offset is
+incremented by the number of bytes written. See
+.BR write (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the write has been prepared, it can be submitted with one of the submit
+functions.
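+.PP
+A minimal sketch, assuming an initialized ring and an open file descriptor
+and omitting error handling, writing a small buffer at offset 0:
+.PP
+.in +4n
+.EX
+char buf[] = "hello";
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* write 5 bytes from buf at file offset 0 */
+io_uring_prep_write(sqe, fd, buf, 5, 0);
+io_uring_submit(&ring);
+.EE
+.in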
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_write_fixed.3 b/man/io_uring_prep_write_fixed.3
new file mode 100644
index 0000000..5dab4a6
--- /dev/null
+++ b/man/io_uring_prep_write_fixed.3
@@ -0,0 +1,72 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_write_fixed 3 "February 13, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_write_fixed \- prepare I/O write request with registered buffer
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_write_fixed(struct io_uring_sqe *" sqe ","
+.BI " int " fd ",
+.BI " const void *" buf ","
+.BI " unsigned " nbytes ","
+.BI " __u64 " offset ","
+.BI " int " buf_index ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_write_fixed (3)
+prepares an IO write request with a previously registered IO buffer. The
+submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start writing
+.I nbytes
+from the buffer
+.I buf
+at the specified
+.I offset
+and with the buffer matching the registered index of
+.IR buf_index .
+
+This works just like
+.BR io_uring_prep_write (3)
+except it requires the use of buffers that have been registered with
+.BR io_uring_register_buffers (3).
+The
+.I buf
+and
+.I nbytes
+arguments must fall within a region specified by
+.I buf_index
+in the previously registered buffer. The buffer need not be aligned with
+the start of the registered buffer.
+
+After the write has been prepared, it can be submitted with one of the submit
+functions.
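+.PP
+A minimal sketch, assuming an initialized ring and an open file descriptor
+with error handling omitted, registering one buffer and writing from it:
+.PP
+.in +4n
+.EX
+char buf[4096];
+struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+struct io_uring_sqe *sqe;
+
+/* buf becomes registered buffer index 0 */
+io_uring_register_buffers(&ring, &iov, 1);
+
+sqe = io_uring_get_sqe(&ring);
+/* write 4096 bytes from the registered buffer at file offset 0 */
+io_uring_prep_write_fixed(sqe, fd, buf, sizeof(buf), 0, 0);
+io_uring_submit(&ring);
+.EE
+.in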
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH SEE ALSO
+.BR io_uring_prep_write (3),
+.BR io_uring_register_buffers (3)
diff --git a/man/io_uring_prep_writev.3 b/man/io_uring_prep_writev.3
new file mode 100644
index 0000000..9fb83d9
--- /dev/null
+++ b/man/io_uring_prep_writev.3
@@ -0,0 +1,85 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_writev 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_writev \- prepare vector I/O write request
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_writev(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const struct iovec *" iovecs ","
+.BI " unsigned " nr_vecs ","
+.BI " __u64 " offset ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_writev (3)
+prepares a vectored IO write request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start writing
+.I nr_vecs
+from the
+.I iovecs
+array at the specified
+.IR offset .
+
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the write operation commences at the file offset, and the file offset is
+incremented by the number of bytes written. See
+.BR write (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the write has been prepared it can be submitted with one of the submit
+functions.
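+.PP
+A short sketch, assuming an initialized ring and an open file descriptor
+with error handling omitted, writing two buffers in one request:
+.PP
+.in +4n
+.EX
+char a[] = "hello, ", b[] = "world";
+struct iovec iov[2] = {
+    { .iov_base = a, .iov_len = 7 },
+    { .iov_base = b, .iov_len = 5 },
+};
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* write both buffers, back to back, starting at file offset 0 */
+io_uring_prep_writev(sqe, fd, iov, 2, 0);
+io_uring_submit(&ring);
+.EE
+.in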
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+Unless an application explicitly needs to pass in more than one iovec, it is more
+efficient to use
+.BR io_uring_prep_write (3)
+rather than this function, as no state has to be maintained for a
+non-vectored IO request.
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_prep_write (3),
+.BR io_uring_prep_writev2 (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_prep_writev2.3 b/man/io_uring_prep_writev2.3
new file mode 100644
index 0000000..5093596
--- /dev/null
+++ b/man/io_uring_prep_writev2.3
@@ -0,0 +1,111 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_prep_writev2 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_prep_writev2 \- prepare vector I/O write request with flags
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_prep_writev2(struct io_uring_sqe *" sqe ","
+.BI " int " fd ","
+.BI " const struct iovec *" iovecs ","
+.BI " unsigned " nr_vecs ","
+.BI " __u64 " offset ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_prep_writev2 (3)
+prepares a vectored IO write request. The submission queue entry
+.I sqe
+is setup to use the file descriptor
+.I fd
+to start writing
+.I nr_vecs
+from the
+.I iovecs
+array at the specified
+.IR offset .
+The behavior of the function can be controlled with the
+.I flags
+parameter.
+
+Supported values for
+.I flags
+are:
+.TP
+.B RWF_HIPRI
+High priority request, poll if possible
+.TP
+.B RWF_DSYNC
+per-IO O_DSYNC
+.TP
+.B RWF_SYNC
+per-IO O_SYNC
+.TP
+.B RWF_NOWAIT
+per-IO, return
+.B -EAGAIN
+if operation would block
+.TP
+.B RWF_APPEND
+per-IO O_APPEND
+
+.P
+On files that support seeking, if the offset is set to
+.BR -1 ,
+the write operation commences at the file offset, and the file offset is
+incremented by the number of bytes written. See
+.BR write (2)
+for more details. Note that for an async API, reading and updating the
+current file offset may result in unpredictable behavior, unless access
+to the file is serialized. Using this feature is discouraged if the
+application or library can provide the desired IO offset itself.
+
+On files that are not capable of seeking, the offset is ignored.
+
+After the write has been prepared, it can be submitted with one of the submit
+functions.
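+.PP
+Usage is identical to
+.BR io_uring_prep_writev (3)
+apart from the final argument. As a sketch (initialized ring, open file
+descriptor, and a populated 4096-byte buffer
+.I buf
+assumed; error handling omitted), the following issues a write with per-IO
+O_DSYNC semantics:
+.PP
+.in +4n
+.EX
+struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+/* this write alone behaves as if the file was opened O_DSYNC */
+io_uring_prep_writev2(sqe, fd, &iov, 1, 0, RWF_DSYNC);
+io_uring_submit(&ring);
+.EE
+.in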
+
+.SH RETURN VALUE
+None
+.SH ERRORS
+The CQE
+.I res
+field will contain the result of the operation. See the related man page for
+details on possible values. Note that where synchronous system calls will return
+.B -1
+on failure and set
+.I errno
+to the actual error value, io_uring never uses
+.IR errno .
+Instead it returns the negated
+.I errno
+directly in the CQE
+.I res
+field.
+.SH NOTES
+Unless an application explicitly needs to pass in more than one iovec, it is more
+efficient to use
+.BR io_uring_prep_write (3)
+rather than this function, as no state has to be maintained for a
+non-vectored IO request.
+As with any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_prep_write (3),
+.BR io_uring_prep_writev (3),
+.BR io_uring_submit (3)
diff --git a/man/io_uring_queue_exit.3 b/man/io_uring_queue_exit.3
index 294b5f3..00f8ae9 100644
--- a/man/io_uring_queue_exit.3
+++ b/man/io_uring_queue_exit.3
@@ -5,14 +5,13 @@
.\"
.TH io_uring_queue_exit 3 "July 10, 2020" "liburing-0.7" "liburing Manual"
.SH NAME
-io_uring_queue_exit - tear down io_uring submission and completion queues
+io_uring_queue_exit \- tear down io_uring submission and completion queues
.SH SYNOPSIS
.nf
-.BR "#include <liburing.h>"
+.B #include <liburing.h>
.PP
-.BI "void io_uring_queue_exit(struct io_uring * ring );"
+.BI "void io_uring_queue_exit(struct io_uring *" ring ");"
.fi
-.PP
.SH DESCRIPTION
.PP
.BR io_uring_queue_exit (3)
diff --git a/man/io_uring_queue_init.3 b/man/io_uring_queue_init.3
index 1980fa4..086b70f 100644
--- a/man/io_uring_queue_init.3
+++ b/man/io_uring_queue_init.3
@@ -5,40 +5,85 @@
.\"
.TH io_uring_queue_init 3 "July 10, 2020" "liburing-0.7" "liburing Manual"
.SH NAME
-io_uring_queue_init - setup io_uring submission and completion queues
+io_uring_queue_init \- setup io_uring submission and completion queues
.SH SYNOPSIS
.nf
-.BR "#include <liburing.h>"
+.B #include <liburing.h>
.PP
-.BI "int io_uring_queue_init(unsigned " entries ", struct io_uring *" ring ,
-.BI " unsigned " flags );
-.fi
+.BI "int io_uring_queue_init(unsigned " entries ","
+.BI " struct io_uring *" ring ","
+.BI " unsigned " flags ");"
.PP
+.BI "int io_uring_queue_init_params(unsigned " entries ","
+.BI " struct io_uring *" ring ","
+.BI " struct io_uring_params *" params ");"
+.fi
.SH DESCRIPTION
.PP
-The io_uring_queue_init() function executes the io_uring_setup syscall to
-initialize the submission and completion queues in the kernel with at least
+The
+.BR io_uring_queue_init (3)
+function executes the
+.BR io_uring_setup (2)
+system call to initialize the submission and completion queues in the kernel
+with at least
.I entries
-entries and then maps the resulting file descriptor to memory shared between the
-application and the kernel.
+entries in the submission queue and then maps the resulting file descriptor to
+memory shared between the application and the kernel.
-On success io_uring_queue_init() returns 0 and
+By default, the CQ ring will have twice the number of entries as specified by
+.I entries
+for the SQ ring. This is adequate for regular file or storage workloads, but
+may be too small for networked workloads. The SQ ring entries do not impose a
+limit on the number of in-flight requests that the ring can support; they
+merely limit the number that can be submitted to the kernel in one go (batch).
+If the CQ ring overflows, e.g. more entries are generated than fit in the ring
+before the application can reap them, then the ring enters a CQ ring overflow
+state. This
+is indicated by
+.B IORING_SQ_CQ_OVERFLOW
+being set in the SQ ring flags. Unless the kernel runs out of available memory,
+entries are not dropped, but it is a much slower completion path and will slow
+down request processing. For that reason it should be avoided and the CQ
+ring sized appropriately for the workload. Setting
+.I cq_entries
+in
+.I struct io_uring_params
+will tell the kernel to allocate this many entries for the CQ ring, independent
+of the SQ ring size given in
+.IR entries .
+If the value isn't a power of 2, it will be rounded up to the nearest power of
+2.
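+.PP
+As a sketch, an application expecting bursts of completions might size the
+CQ ring explicitly. Note that the
+.B IORING_SETUP_CQSIZE
+flag must be set in
+.I params
+for
+.I cq_entries
+to be honored:
+.PP
+.in +4n
+.EX
+struct io_uring_params p = { 0 };
+struct io_uring ring;
+
+p.flags = IORING_SETUP_CQSIZE;
+p.cq_entries = 4096; /* SQ ring gets 128 entries, CQ ring 4096 */
+
+io_uring_queue_init_params(128, &ring, &p);
+.EE
+.in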
+
+On success,
+.BR io_uring_queue_init (3)
+returns 0 and
.I ring
will point to the shared memory containing the io_uring queues. On failure
--errno is returned.
+.BR -errno
+is returned.
.I flags
-will be passed through to the io_uring_setup syscall (see
+will be passed through to the io_uring_setup syscall (see
.BR io_uring_setup (2)).
+If the
+.BR io_uring_queue_init_params (3)
+variant is used, then the parameters indicated by
+.I params
+will be passed straight through to the
+.BR io_uring_setup (2)
+system call.
+
On success, the resources held by
.I ring
should be released via a corresponding call to
.BR io_uring_queue_exit (3).
.SH RETURN VALUE
.BR io_uring_queue_init (3)
-returns 0 on success and -errno on failure.
+returns 0 on success and
+.BR -errno
+on failure.
.SH SEE ALSO
.BR io_uring_setup (2),
+.BR io_uring_register_ring_fd (3),
.BR mmap (2),
.BR io_uring_queue_exit (3)
diff --git a/man/io_uring_queue_init_params.3 b/man/io_uring_queue_init_params.3
new file mode 120000
index 0000000..c91609e
--- /dev/null
+++ b/man/io_uring_queue_init_params.3
@@ -0,0 +1 @@
+io_uring_queue_init.3 \ No newline at end of file
diff --git a/man/io_uring_register.2 b/man/io_uring_register.2
index 5326a87..1e91caf 100644
--- a/man/io_uring_register.2
+++ b/man/io_uring_register.2
@@ -88,14 +88,107 @@ then issuing a new call to
.BR io_uring_register ()
with the new buffers.
-Note that registering buffers will wait for the ring to idle. If the application
-currently has requests in-flight, the registration will wait for those to
-finish before proceeding.
+Note that before 5.13 registering buffers would wait for the ring to idle.
+If the application currently has requests in-flight, the registration will
+wait for those to finish before proceeding.
An application need not unregister buffers explicitly before shutting
down the io_uring instance. Available since 5.1.
.TP
+.B IORING_REGISTER_BUFFERS2
+Register buffers for I/O. Similar to
+.B IORING_REGISTER_BUFFERS
+but aims to have a more extensible ABI.
+
+.I arg
+points to a
+.I struct io_uring_rsrc_register,
+and
+.I nr_args
+should be set to the number of bytes in the structure.
+
+.PP
+.in +8n
+.EX
+struct io_uring_rsrc_register {
+ __u32 nr;
+ __u32 resv;
+ __u64 resv2;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+};
+
+.EE
+.in
+.PP
+
+
+The
+.I data
+field contains a pointer to a
+.I struct iovec
+array of
+.I nr
+entries.
+The
+.I tags
+field should either be 0, then tagging is disabled, or point to an array
+of
+.I nr
+"tags" (unsigned 64 bit integers). If a tag is zero, then tagging for this
+particular resource (a buffer in this case) is disabled. Otherwise, after the
+resource has been unregistered and is no longer in use, a CQE will be
+posted with
+.I user_data
+set to the specified tag and all other fields zeroed.
+
+Note that resource updates, e.g.
+.B IORING_REGISTER_BUFFERS_UPDATE,
+don't necessarily deallocate resources by the time the update returns; they
+might be held alive until all requests using them complete.
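+.PP
+As a sketch, registering two buffers without tags via the raw system call
+(there is no dedicated libc wrapper, so
+.BR syscall (2)
+is used here; the ring descriptor and buffers are assumed set up, error
+handling omitted):
+.PP
+.in +8n
+.EX
+struct iovec iovs[2] = {
+    { .iov_base = buf1, .iov_len = 4096 },
+    { .iov_base = buf2, .iov_len = 4096 },
+};
+struct io_uring_rsrc_register rr = {
+    .nr = 2,
+    .data = (unsigned long) iovs,
+    .tags = 0, /* tagging disabled */
+};
+
+syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS2,
+        &rr, sizeof(rr));
+.EE
+.in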
+
+Available since 5.13.
+
+.TP
+.B IORING_REGISTER_BUFFERS_UPDATE
+Updates registered buffers with new ones, either turning a sparse entry into
+a real one, or replacing an existing entry.
+
+.I arg
+must contain a pointer to a struct io_uring_rsrc_update2, which contains
+an offset on which to start the update, and an array of
+.I struct iovec.
+.I tags
+points to an array of tags.
+.I nr
+must contain the number of descriptors in the passed in arrays.
+See
+.B IORING_REGISTER_BUFFERS2
+for the resource tagging description.
+
+.PP
+.in +8n
+.EX
+
+struct io_uring_rsrc_update2 {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+ __u32 nr;
+ __u32 resv2;
+};
+.EE
+.in
+.PP
+
+
+Available since 5.13.
+
+.TP
.B IORING_UNREGISTER_BUFFERS
This operation takes no argument, and
.I arg
@@ -128,25 +221,60 @@ See
.B IORING_REGISTER_FILES_UPDATE
for how to update files in place.
-Note that registering files will wait for the ring to idle. If the application
-currently has requests in-flight, the registration will wait for those to
-finish before proceeding. See
+Note that before 5.13 registering files would wait for the ring to idle.
+If the application currently has requests in-flight, the registration will
+wait for those to finish before proceeding. See
.B IORING_REGISTER_FILES_UPDATE
for how to update an existing set without that limitation.
Files are automatically unregistered when the io_uring instance is
-torn down. An application need only unregister if it wishes to
+torn down. An application need only unregister if it wishes to
register a new set of fds. Available since 5.1.
.TP
+.B IORING_REGISTER_FILES2
+Register files for I/O. Similar to
+.B IORING_REGISTER_FILES.
+
+.I arg
+points to a
+.I struct io_uring_rsrc_register,
+and
+.I nr_args
+should be set to the number of bytes in the structure.
+
+The
+.I data
+field contains a pointer to an array of
+.I nr
+file descriptors (signed 32 bit integers).
+.I tags
+field should either be 0 or point to an array of
+.I nr
+"tags" (unsigned 64 bit integers). See
+.B IORING_REGISTER_BUFFERS2
+for more info on resource tagging.
+
+Note that resource updates, e.g.
+.B IORING_REGISTER_FILES_UPDATE,
+don't necessarily deallocate resources; they might be held until all requests
+using that resource complete.
+
+Available since 5.13.
+
+.TP
.B IORING_REGISTER_FILES_UPDATE
This operation replaces existing files in the registered file set with new
-ones, either turning a sparse entry (one where fd is equal to -1) into a
-real one, removing an existing entry (new one is set to -1), or replacing
-an existing entry with a new existing entry.
+ones, either turning a sparse entry (one where fd is equal to
+.B -1
+) into a real one, removing an existing entry (new one is set to
+.B -1
+), or replacing an existing entry with a new existing entry.
.I arg
-must contain a pointer to a struct io_uring_files_update, which contains
+must contain a pointer to a
+.I struct io_uring_files_update,
+which contains
an offset on which to start the update, and an array of file descriptors to
use for the update.
.I nr_args
@@ -158,6 +286,32 @@ File descriptors can be skipped if they are set to
Skipping an fd will not touch the file associated with the previous
fd at that index. Available since 5.12.
+.TP
+.B IORING_REGISTER_FILES_UPDATE2
+Similar to IORING_REGISTER_FILES_UPDATE, replaces existing files in the
+registered file set with new ones, either turning a sparse entry (one where
+fd is equal to
+.B -1
+) into a real one, removing an existing entry (new one is set to
+.B -1
+), or replacing an existing entry with a new existing entry.
+
+.I arg
+must contain a pointer to a
+.I struct io_uring_rsrc_update2,
+which contains
+an offset on which to start the update, and an array of file descriptors to
+use for the update stored in
+.I data.
+.I tags
+points to an array of tags.
+.I nr
+must contain the number of descriptors in the passed in arrays.
+See
+.B IORING_REGISTER_BUFFERS2
+for the resource tagging description.
+
+Available since 5.13.
.TP
.B IORING_UNREGISTER_FILES
@@ -174,7 +328,13 @@ registered through this operation.
.I arg
must contain a pointer to the eventfd file descriptor, and
.I nr_args
-must be 1. Available since 5.2.
+must be 1. Note that while io_uring generally takes care to avoid spurious
+events, they can occur. Similarly, batched completions of CQEs may only trigger
+a single eventfd notification even if multiple CQEs are posted. The application
+should not assume that the number of available events correlates directly with
+the number of eventfd notifications posted. An eventfd notification must thus
+only be treated as a hint to check the CQ ring for completions. Available since
+5.2.
An application can temporarily disable notifications, coming through the
registered eventfd, by setting the
@@ -292,11 +452,140 @@ must be specified in the call to
Available since 5.10.
+.TP
+.B IORING_REGISTER_IOWQ_AFF
+By default, async workers created by io_uring will inherit the CPU mask of its
+parent. This is usually all the CPUs in the system, unless the parent is being
+run with a limited set. If this isn't the desired outcome, the application
+may explicitly tell io_uring what CPUs the async workers may run on.
+.I arg
+must point to a
+.B cpu_set_t
+mask, and
+.I nr_args
+the byte size of that mask.
+
+Available since 5.14.
+
+.TP
+.B IORING_UNREGISTER_IOWQ_AFF
+Undoes a CPU mask previously set with
+.B IORING_REGISTER_IOWQ_AFF.
+Must not have
+.I arg
+or
+.I nr_args
+set.
+
+Available since 5.14.
+
+.TP
+.B IORING_REGISTER_IOWQ_MAX_WORKERS
+By default, io_uring limits the unbounded workers created to the maximum
+processor count set by
+.I RLIMIT_NPROC
+and the bounded workers is a function of the SQ ring size and the number
+of CPUs in the system. Sometimes this can be excessive (or too little, for
+bounded), and this command provides a way to change the count per ring (per NUMA
+node) instead.
+
+.I arg
+must be set to an
+.I unsigned int
+pointer to an array of two values, with the values in the array being set to
+the maximum count of workers per NUMA node. Index 0 holds the bounded worker
+count, and index 1 holds the unbounded worker count. On successful return, the
+passed in array will contain the previous maximum values for each type. If the
+count being passed in is 0, then this command returns the current maximum values
+and doesn't modify the current setting.
+.I nr_args
+must be set to 2, as the command takes two values.
+
+Available since 5.15.
+
+.TP
+.B IORING_REGISTER_RING_FDS
+Whenever
+.BR io_uring_enter (2)
+is called to submit requests or wait for completions, the kernel must grab a
+reference to the file descriptor. If the application using io_uring is threaded,
+the file table is marked as shared, and the reference grab and put of the file
+descriptor count is more expensive than it is for a non-threaded application.
+
+Similarly to how io_uring allows registration of files, this allows registration
+of the ring file descriptor itself. This reduces the overhead of the
+.BR io_uring_enter (2)
+system call.
+
+.I arg
+must be set to an unsigned int pointer to an array of type
+.I struct io_uring_rsrc_update
+of
+.I nr_args
+number of entries. The
+.B data
+field of this struct must point to an io_uring file descriptor, and the
+.B offset
+field can be either
+.B -1
+or an explicit offset desired for the registered file descriptor value. If
+.B -1
+is used, then upon successful return of this system call, the field will
+contain the value of the registered file descriptor to be used for future
+.BR io_uring_enter (2)
+system calls.
+
+On successful completion of this request, the returned descriptors may be used
+instead of the real file descriptor for
+.BR io_uring_enter (2),
+provided that
+.B IORING_ENTER_REGISTERED_RING
+is set in the
+.I flags
+for the system call. This flag tells the kernel that a registered descriptor
+is used rather than a real file descriptor.
+
+Each thread or process using a ring must register the file descriptor directly
+by issuing this request.
+
+The maximum number of supported registered ring descriptors is currently
+limited to
+.B 16.
+
+Available since 5.18.
+
+.TP
+.B IORING_UNREGISTER_RING_FDS
+Unregister descriptors previously registered with
+.B IORING_REGISTER_RING_FDS.
+
+.I arg
+must be set to an unsigned int pointer to an array of type
+.I struct io_uring_rsrc_update
+of
+.I nr_args
+number of entries. Only the
+.B offset
+field should be set in the structure, containing the registered file descriptor
+offset previously returned from
+.B IORING_REGISTER_RING_FDS
+that the application wishes to unregister.
+
+Note that this isn't done automatically on ring exit if the thread or task
+that previously registered a ring file descriptor isn't exiting. It is
+recommended to manually unregister any previously registered ring descriptors
+if the ring is closed and the task persists. This will free up a registration
+slot, making it available for future use.
+
+Available since 5.18.
+
.SH RETURN VALUE
On success,
.BR io_uring_register ()
-returns 0. On error, -1 is returned, and
+returns 0. On error,
+.B -1
+is returned, and
.I errno
is set accordingly.
diff --git a/man/io_uring_register_buf_ring.3 b/man/io_uring_register_buf_ring.3
new file mode 100644
index 0000000..9e520bf
--- /dev/null
+++ b/man/io_uring_register_buf_ring.3
@@ -0,0 +1,139 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_buf_ring 3 "May 18, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_register_buf_ring \- register buffer ring for provided buffers
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_buf_ring(struct io_uring *" ring ",
+.BI " struct io_uring_buf_reg *" reg ",
+.BI " unsigned int " flags ");"
+.BI "
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_register_buf_ring (3)
+function registers a shared buffer ring to be used with provided buffers. For
+the request types that support it, provided buffers are given to the ring and
+one is selected by a request if it has
+.B IOSQE_BUFFER_SELECT
+set in the SQE
+.IR flags ,
+when the request is ready to receive data. This allows both clear ownership
+of the buffer lifetime, and a way to have more read/receive type of operations
+in flight than buffers available.
+
+The
+.I reg
+argument must be filled in with the appropriate information. It looks as
+follows:
+.PP
+.in +4n
+.EX
+struct io_uring_buf_reg {
+ __u64 ring_addr;
+ __u32 ring_entries;
+ __u16 bgid;
+ __u16 pad;
+ __u64 resv[3];
+};
+.EE
+.in
+.PP
+The
+.I ring_addr
+field must contain the address of the memory allocated to fit this ring.
+The memory must be page aligned and hence allocated appropriately, e.g. using
+.BR posix_memalign (3)
+or similar. The size of the ring is the product of
+.I ring_entries
+and the size of
+.IR "struct io_uring_buf" .
+.I ring_entries
+is the desired size of the ring, and must be a power-of-2 in size.
+.I bgid
+is the buffer group ID associated with this ring. SQEs that select a buffer
+have a buffer group associated with them in their
+.I buf_group
+field, and the associated CQE will have
+.B IORING_CQE_F_BUFFER
+set in their
+.I flags
+member, which will also contain the specific ID of the buffer selected. The rest
+of the fields are reserved and must be cleared to zero.
+
+The
+.I flags
+argument is currently unused and must be set to zero.
+
+A shared buffer ring looks as follows:
+.PP
+.in +4n
+.EX
+struct io_uring_buf_ring {
+ union {
+ struct {
+ __u64 resv1;
+ __u32 resv2;
+ __u16 resv3;
+ __u16 tail;
+ };
+ struct io_uring_buf bufs[0];
+ };
+};
+.EE
+.in
+.PP
+where
+.I tail
+is the index at which the application can insert new buffers for consumption
+by requests, and
+.I struct io_uring_buf
+is the buffer definition:
+.PP
+.in +4n
+.EX
+struct io_uring_buf {
+ __u64 addr;
+ __u32 len;
+ __u16 bid;
+ __u16 resv;
+};
+.EE
+.in
+.PP
+where
+.I addr
+is the address for the buffer,
+.I len
+is the length of the buffer in bytes, and
+.I bid
+is the buffer ID that will be returned in the CQE once consumed.
+
+Reserved fields must not be touched. Applications must use
+.BR io_uring_buf_ring_init (3)
+to initialise the buffer ring. Applications may use
+.BR io_uring_buf_ring_add (3)
+and
+.BR io_uring_buf_ring_advance (3)
+or
+.BR io_uring_buf_ring_cq_advance (3)
+to provide buffers, which will set these fields and update the tail.
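+.PP
+A condensed sketch of setting up a small ring of 8 buffer slots for group
+ID 0 (initialized ring assumed; error handling and the population of the
+individual buffer entries omitted):
+.PP
+.in +4n
+.EX
+struct io_uring_buf_reg reg = { 0 };
+struct io_uring_buf_ring *br;
+
+/* page-aligned memory for an 8-entry ring */
+posix_memalign((void **) &br, 4096, 8 * sizeof(struct io_uring_buf));
+
+reg.ring_addr = (unsigned long) br;
+reg.ring_entries = 8;
+reg.bgid = 0;
+
+io_uring_register_buf_ring(&ring, &reg, 0);
+io_uring_buf_ring_init(br);
+.EE
+.in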
+
+Available since 5.19.
+
+.SH RETURN VALUE
+On success
+.BR io_uring_register_buf_ring (3)
+returns 0. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_buf_ring_init (3),
+.BR io_uring_buf_ring_add (3),
+.BR io_uring_buf_ring_advance (3),
+.BR io_uring_buf_ring_cq_advance (3)
diff --git a/man/io_uring_register_buffers.3 b/man/io_uring_register_buffers.3
new file mode 100644
index 0000000..656ac42
--- /dev/null
+++ b/man/io_uring_register_buffers.3
@@ -0,0 +1,61 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_buffers 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_register_buffers \- register buffers for fixed buffer operations
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_buffers(struct io_uring *" ring ",
+.BI " const struct iovec *" iovecs ",
+.BI " unsigned " nr_iovecs ");"
+.PP
+.BI "int io_uring_register_buffers_sparse(struct io_uring *" ring ",
+.BI " unsigned " nr_iovecs ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_register_buffers (3)
+function registers
+.I nr_iovecs
+number of buffers defined by the array
+.I iovecs
+belonging to the
+.IR ring .
+
+The
+.BR io_uring_register_buffers_sparse (3)
+function registers
+.I nr_iovecs
+empty buffers belonging to the
+.IR ring .
+These buffers must be updated before use, using e.g.
+.BR io_uring_register_buffers_update_tag (3).
+
+After the caller has registered the buffers, they can be used with one of the
+fixed buffers functions.
+
+Registered buffers are an optimization that is useful in conjunction with
+.B O_DIRECT
+reads and writes, where the specified range is mapped into the kernel once,
+when the buffer is registered, rather than mapped and unmapped for each IO
+to that region. Additionally, it also avoids manipulating the page reference
+counts for each IO.
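+.PP
+A minimal sketch (initialized ring assumed, error handling omitted)
+registering a single 4KiB buffer:
+.PP
+.in +4n
+.EX
+char buf[4096];
+struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+
+/* buf becomes fixed buffer index 0 */
+io_uring_register_buffers(&ring, &iov, 1);
+.EE
+.in
+.PP
+The buffer can then be used with
+.BR io_uring_prep_read_fixed (3)
+or
+.BR io_uring_prep_write_fixed (3)
+by passing 0 as the buffer index.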
+
+.SH RETURN VALUE
+On success
+.BR io_uring_register_buffers (3)
+and
+.BR io_uring_register_buffers_sparse (3)
+return 0. On failure they return
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_unregister_buffers (3),
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_prep_read_fixed (3),
+.BR io_uring_prep_write_fixed (3)
diff --git a/man/io_uring_register_eventfd.3 b/man/io_uring_register_eventfd.3
new file mode 100644
index 0000000..5cbe72a
--- /dev/null
+++ b/man/io_uring_register_eventfd.3
@@ -0,0 +1,51 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_eventfd 3 "April 16, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_register_eventfd \- register an eventfd with a ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_eventfd(struct io_uring *" ring ","
+.BI " int " fd ");"
+.PP
+.BI "int io_uring_register_eventfd_async(struct io_uring *" ring ","
+.BI " int " fd ");"
+.PP
+.BI "int io_uring_unregister_eventfd(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_register_eventfd (3)
+registers the eventfd file descriptor
+.I fd
+with the ring identified by
+.IR ring .
+
+Whenever completions are posted to the CQ ring, an eventfd notification
+is generated with the registered eventfd descriptor. If
+.BR io_uring_register_eventfd_async (3)
+is used, only events that completed out-of-line will trigger a notification.
+
+If notifications are no longer desired,
+.BR io_uring_unregister_eventfd (3)
+may be called to remove the eventfd registration. No eventfd argument is
+needed, as a ring can only have a single eventfd registered.
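+.PP
+A brief sketch, assuming an initialized ring and omitting error handling,
+of creating an eventfd and tying it to the ring:
+.PP
+.in +4n
+.EX
+#include <sys/eventfd.h>
+
+int efd = eventfd(0, 0);
+
+io_uring_register_eventfd(&ring, efd);
+/* efd can now be read or polled to learn of CQ activity */
+.EE
+.in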
+
+.SH NOTES
+While io_uring generally takes care to avoid spurious events, they can occur.
+Similarly, batched completions of CQEs may only trigger a single eventfd
+notification even if multiple CQEs are posted. The application should not
+assume that the number of available events correlates directly with the
+number of eventfd notifications posted. An eventfd notification must thus only be treated
+as a hint to check the CQ ring for completions.
+.SH RETURN VALUE
+Returns 0 on success, or
+.BR -errno
+on error.
+.SH SEE ALSO
+.BR eventfd (2)
diff --git a/man/io_uring_register_eventfd_async.3 b/man/io_uring_register_eventfd_async.3
new file mode 120000
index 0000000..6659957
--- /dev/null
+++ b/man/io_uring_register_eventfd_async.3
@@ -0,0 +1 @@
+io_uring_register_eventfd.3 \ No newline at end of file
diff --git a/man/io_uring_register_files.3 b/man/io_uring_register_files.3
new file mode 100644
index 0000000..0a9ccc3
--- /dev/null
+++ b/man/io_uring_register_files.3
@@ -0,0 +1,50 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_files 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_register_files \- register file descriptors
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_files(struct io_uring *" ring ","
+.BI " const int *" files ","
+.BI " unsigned " nr_files ");"
+.PP
+.BI "int io_uring_register_files_sparse(struct io_uring *" ring ","
+.BI " unsigned " nr_files ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_register_files (3)
+function registers
+.I nr_files
+number of file descriptors defined by the array
+.I files
+belonging to the
+.I ring
+for subsequent operations.
+
+The
+.BR io_uring_register_files_sparse (3)
+function registers an empty file table of
+.I nr_files
+number of file descriptors. The sparse variant is available in kernels 5.19
+and later.
+
+Registering a file table is a prerequisite for using any request that uses
+direct descriptors.
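+.PP
+A short sketch (initialized ring assumed, error handling omitted)
+registering two already-open file descriptors:
+.PP
+.in +4n
+.EX
+int fds[2] = { fd1, fd2 };
+
+/* fd1 and fd2 become registered file indexes 0 and 1 */
+io_uring_register_files(&ring, fds, 2);
+.EE
+.in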
+
+.SH RETURN VALUE
+On success
+.BR io_uring_register_files (3)
+and
+.BR io_uring_register_files_sparse (3)
+return 0. On failure they return
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_unregister_files (3)
diff --git a/man/io_uring_register_iowq_aff.3 b/man/io_uring_register_iowq_aff.3
new file mode 100644
index 0000000..e782914
--- /dev/null
+++ b/man/io_uring_register_iowq_aff.3
@@ -0,0 +1,61 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_iowq_aff 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_register_iowq_aff \- register async worker CPU affinities
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_iowq_aff(struct io_uring *" ring ","
+.BI " size_t " cpusz ","
+.BI " const cpu_set_t *" mask ");
+.PP
+.BI "void io_uring_unregister_iowq_aff(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_register_iowq_aff (3)
+function registers a set of CPU affinities to be used by the io_uring async
+workers. By default, io_uring async workers are allowed to run on any CPU in
+the system. If this function is called with
+.I ring
+set to the ring in question and
+.I mask
+set to a pointer to a
+.B cpu_set_t
+value and
+.I cpusz
+set to the size of the CPU set, then async workers will only be allowed to run
+on the CPUs specified in the mask. Existing workers may need to hit a schedule
+point before they are migrated.
+
+For unregistration,
+.BR io_uring_unregister_iowq_aff (3)
+may be called to restore CPU affinities to the default.
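+.PP
+As a sketch, restricting async workers to CPUs 0 and 1 (initialized ring
+assumed, error handling omitted):
+.PP
+.in +4n
+.EX
+cpu_set_t mask;
+
+CPU_ZERO(&mask);
+CPU_SET(0, &mask);
+CPU_SET(1, &mask);
+
+io_uring_register_iowq_aff(&ring, sizeof(mask), &mask);
+.EE
+.in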
+
+.SH RETURN VALUE
+Returns
+.B 0
+on success, or any of the following values in case of error.
+.TP
+.B -EFAULT
+The kernel was unable to copy the memory pointed to by
+.I mask
+as it was invalid.
+.TP
+.B -ENOMEM
+The kernel was unable to allocate memory for the new CPU mask.
+.TP
+.B -EINVAL
+.I cpusz
+or
+.I mask
+was NULL/0, or any other value specified was invalid.
+.SH SEE ALSO
+.BR io_uring_queue_init (3),
+.BR io_uring_register (2)
diff --git a/man/io_uring_register_iowq_max_workers.3 b/man/io_uring_register_iowq_max_workers.3
new file mode 100644
index 0000000..2557e21
--- /dev/null
+++ b/man/io_uring_register_iowq_max_workers.3
@@ -0,0 +1,71 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_iowq_max_workers 3 "March 13, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_register_iowq_max_workers \- modify the maximum allowed async workers
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_iowq_max_workers(struct io_uring *" ring ","
+.BI " unsigned int *" values ");"
+.fi
+.SH DESCRIPTION
+.PP
+io_uring async workers are split into two types:
+.TP
+.B Bounded
+These workers have a bounded execution time. Examples of that are filesystem
+reads, which normally complete in a relatively short amount of time. In case
+of disk failures, they are still bounded by a timeout operation that will
+abort them if exceeded.
+.TP
+.B Unbounded
+Work items here may take an indefinite amount of time to complete. Examples
+include doing IO to sockets, pipes, or any other non-regular type of file.
+
+.PP
+By default, the amount of bounded IO workers is limited to how many SQ entries
+the ring was setup with, or 4 times the number of online CPUs in the system,
+whichever is smaller. Unbounded workers are only limited by the process task
+limit, as indicated by the rlimit
+.B RLIMIT_NPROC
+limit.
+
+This can be modified by calling
+.B io_uring_register_iowq_max_workers
+with
+.I ring
+set to the ring in question, and
+.I values
+pointing to an array of two values. The first element should contain the number
+of desired bounded workers, and the second element should contain the number
+of desired unbounded workers. These are both maximum values; io_uring will
+not maintain a high count of idle workers, as they are reaped when they are
+no longer necessary.
+
+If called with both values set to 0, the existing values are returned.
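+.PP
+As an illustration (initialized ring assumed, error handling omitted), the
+following caps the ring at 8 bounded and 64 unbounded workers, with the
+previous limits returned in place:
+.PP
+.in +4n
+.EX
+unsigned int vals[2] = { 8, 64 }; /* bounded, unbounded */
+
+io_uring_register_iowq_max_workers(&ring, vals);
+/* vals now holds the previous maximum values */
+.EE
+.in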
+
+.SH RETURN VALUE
+Returns
+.B 0
+on success, with
+.I values
+containing the previous values for the settings. On error, any of the following
+may be returned.
+.TP
+.B -EFAULT
+The kernel was unable to copy the memory pointed to by
+.I values
+as it was invalid.
+.TP
+.B -EINVAL
+.I values
+was
+.B NULL
+or the new values exceeded the maximum allowed value.
+.SH SEE ALSO
+.BR io_uring_queue_init (3),
+.BR io_uring_register (2)
diff --git a/man/io_uring_register_ring_fd.3 b/man/io_uring_register_ring_fd.3
new file mode 100644
index 0000000..e70c551
--- /dev/null
+++ b/man/io_uring_register_ring_fd.3
@@ -0,0 +1,49 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_register_ring_fd 3 "March 11, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_register_ring_fd \- register a ring file descriptor
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_register_ring_fd(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_register_ring_fd (3)
+registers the file descriptor of the ring.
+
+Whenever
+.BR io_uring_enter (2)
+is called to submit requests or wait for completions, the kernel must grab a
+reference to the file descriptor. If the application using io_uring is threaded,
+the file table is marked as shared, and the reference grab and put of the file
+descriptor count is more expensive than it is for a non-threaded application.
+
+Similarly to how io_uring allows registration of files, this allows registration
+of the ring file descriptor itself. This reduces the overhead of the
+.BR io_uring_enter (2)
+system call.
+
+If an application using liburing is threaded, it should call
+this function to register the ring descriptor when a ring is set up. See NOTES
+for restrictions when a ring is shared.
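+.PP
+A minimal sketch, omitting error handling:
+.PP
+.in +4n
+.EX
+struct io_uring ring;
+
+io_uring_queue_init(8, &ring, 0);
+
+/* returns 1 when one descriptor was successfully registered */
+io_uring_register_ring_fd(&ring);
+.EE
+.in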
+
+.SH NOTES
+When the ring descriptor is registered, it is stored internally in the
+.I struct io_uring
+structure. For applications that share a ring between threads, for example
+having one thread do submits and another reap events, this optimization
+cannot be used, as each thread may have a different index for the registered
+ring fd.
+.SH RETURN VALUE
+Returns 1 on success, indicating that one file descriptor was registered,
+or
+.BR -errno
+on error.
+.SH SEE ALSO
+.BR io_uring_unregister_ring_fd (3),
+.BR io_uring_register_files (3)
diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2
index cb8eba9..75c69ff 100644
--- a/man/io_uring_setup.2
+++ b/man/io_uring_setup.2
@@ -37,7 +37,8 @@ struct io_uring_params {
__u32 sq_thread_cpu;
__u32 sq_thread_idle;
__u32 features;
- __u32 resv[4];
+ __u32 wq_fd;
+ __u32 resv[3];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};
@@ -170,7 +171,7 @@ then it will be clamped at
.B IORING_MAX_CQ_ENTRIES .
.TP
.B IORING_SETUP_ATTACH_WQ
-This flag should be set in conjunction with
+This flag should be set in conjunction with
.IR "struct io_uring_params.wq_fd"
being set to an existing io_uring ring file descriptor. When set, the
io_uring instance being created will share the asynchronous worker
@@ -183,6 +184,61 @@ In this state, restrictions can be registered, but submissions are not allowed.
See
.BR io_uring_register (2)
for details on how to enable the ring. Available since 5.10.
+.TP
+.B IORING_SETUP_SUBMIT_ALL
+Normally io_uring stops submitting a batch of requests if one of these
+requests results in an error. This can cause fewer submissions than expected,
+if a request ends in error while being submitted. If the ring is created with
+this flag,
+.BR io_uring_enter (2)
+will continue submitting requests even if it encounters an error submitting
+a request. CQEs are still posted for errored requests regardless of whether or
+not this flag is set at ring creation time; the only difference is whether the
+submit sequence is halted or continued when an error is observed. Available
+since 5.18.
+.TP
+.B IORING_SETUP_COOP_TASKRUN
+By default, io_uring will interrupt a task running in userspace when a
+completion event comes in. This is to ensure that completions run in a timely
+manner. For a lot of use cases, this is overkill and can cause reduced
+performance from the inter-processor interrupt used to do this, the
+kernel/user transition, the needless interruption of the task's userspace
+activities, and reduced batching if completions come in at a rapid rate. Most
+applications don't need the forceful interruption, as the events are processed
+at any kernel/user transition. The exception is setups where the application
+uses multiple threads operating on the same ring, where the application
+waiting on completions isn't the one that submitted them. For most other
+use cases, setting this flag will improve performance. Available since 5.19.
+.TP
+.B IORING_SETUP_TASKRUN_FLAG
+Used in conjunction with
+.B IORING_SETUP_COOP_TASKRUN,
+this provides a flag,
+.B IORING_SQ_TASKRUN,
+which is set in the SQ ring
+.I flags
+whenever completions are pending that should be processed. liburing will check
+for this flag even when doing
+.BR io_uring_peek_cqe (3)
+and enter the kernel to process them, and applications can do the same. This
+makes
+.B IORING_SETUP_TASKRUN_FLAG
+safe to use even when applications rely on a peek style operation on the CQ
+ring to see if anything might be pending to reap. Available since 5.19.
+.TP
+.B IORING_SETUP_SQE128
+If set, io_uring will use 128-byte SQEs rather than the normal 64-byte sized
+variant. This is a requirement for using certain request types; as of 5.19,
+only the
+.B IORING_OP_URING_CMD
+passthrough command for NVMe needs this. Available since 5.19.
+.TP
+.B IORING_SETUP_CQE32
+If set, io_uring will use 32-byte CQEs rather than the normal 16-byte sized
+variant. This is a requirement for using certain request types; as of 5.19,
+only the
+.B IORING_OP_URING_CMD
+passthrough command for NVMe needs this. Available since 5.19.
.PP
If no flags are specified, the io_uring instance is setup for
interrupt driven I/O. I/O may be submitted using
@@ -202,27 +258,30 @@ If this flag is set, the two SQ and CQ rings can be mapped with a single
.I mmap(2)
call. The SQEs must still be allocated separately. This brings the necessary
.I mmap(2)
-calls down from three to two.
+calls down from three to two. Available since kernel 5.4.
.TP
.B IORING_FEAT_NODROP
If this flag is set, io_uring supports never dropping completion events.
If a completion event occurs and the CQ ring is full, the kernel stores
the event internally until such a time that the CQ ring has room for more
entries. If this overflow condition is entered, attempting to submit more
-IO with fail with the
+IO will fail with the
.B -EBUSY
error value, if it can't flush the overflown events to the CQ ring. If this
happens, the application must reap events from the CQ ring and attempt the
-submit again.
+submit again. Available since kernel 5.5.
.TP
.B IORING_FEAT_SUBMIT_STABLE
If this flag is set, applications can be certain that any data for
-async offload has been consumed when the kernel has consumed the SQE.
+async offload has been consumed when the kernel has consumed the SQE. Available
+since kernel 5.5.
.TP
.B IORING_FEAT_RW_CUR_POS
If this flag is set, applications can specify
.I offset
-== -1 with
+==
+.B -1
+with
.B IORING_OP_{READV,WRITEV}
,
.B IORING_OP_{READ,WRITE}_FIXED
@@ -234,10 +293,13 @@ and
.I pwritev2(2)
with
.I offset
-== -1. It'll use (and update) the current file position. This obviously comes
+==
+.B -1.
+It'll use (and update) the current file position. This obviously comes
with the caveat that if the application has multiple reads or writes in flight,
then the end result will not be as expected. This is similar to threads sharing
-a file descriptor and doing IO using the current file position.
+a file descriptor and doing IO using the current file position. Available since
+kernel 5.6.
.TP
.B IORING_FEAT_CUR_PERSONALITY
If this flag is set, then io_uring guarantees that both sync and async
@@ -253,7 +315,7 @@ still register different personalities through
io_uring_register(2)
with
.B IORING_REGISTER_PERSONALITY
-and specify the personality to use in the sqe.
+and specify the personality to use in the sqe. Available since kernel 5.6.
.TP
.B IORING_FEAT_FAST_POLL
If this flag is set, then io_uring supports using an internal poll mechanism
@@ -262,20 +324,81 @@ write data to a file no longer need to be punted to an async thread for
handling, instead they will begin operation when the file is ready. This is
similar to doing poll + read/write in userspace, but eliminates the need to do
so. If this flag is set, requests waiting on space/data consume a lot less
-resources doing so as they are not blocking a thread.
+resources doing so as they are not blocking a thread. Available since kernel
+5.7.
.TP
.B IORING_FEAT_POLL_32BITS
If this flag is set, the
.B IORING_OP_POLL_ADD
command accepts the full 32-bit range of epoll based flags. Most notably
.B EPOLLEXCLUSIVE
-which allows exclusive (waking single waiters) behavior.
+which allows exclusive (waking single waiters) behavior. Available since kernel
+5.9.
.TP
.B IORING_FEAT_SQPOLL_NONFIXED
If this flag is set, the
.B IORING_SETUP_SQPOLL
feature no longer requires the use of fixed files. Any normal file descriptor
-can be used for IO commands without needing registration.
+can be used for IO commands without needing registration. Available since
+kernel 5.11.
+.TP
+.B IORING_FEAT_EXT_ARG
+If this flag is set, then the
+.BR io_uring_enter (2)
+system call supports passing in an extended argument instead of just the
+.IR "sigset_t"
+of earlier kernels. This extended argument is of type
+.IR "struct io_uring_getevents_arg"
+and allows the caller to pass in both a
+.IR "sigset_t"
+and a timeout argument for waiting on events. The struct layout is as follows:
+.PP
+.in +8n
+.EX
+struct io_uring_getevents_arg {
+ __u64 sigmask;
+ __u32 sigmask_sz;
+ __u32 pad;
+ __u64 ts;
+};
+.EE
+
+and a pointer to this struct must be passed in if
+.B IORING_ENTER_EXT_ARG
+is set in the flags for the enter system call. Available since kernel 5.11.
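+.PP
+As an illustrative sketch only (error handling omitted, and the ring file
+descriptor in
+.I ring_fd
+is assumed to be set up already), a bounded wait using the raw system call
+could look like this:
+.PP
+.in +8n
+.EX
+struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
+struct io_uring_getevents_arg arg = {
+    .sigmask    = 0,
+    .sigmask_sz = _NSIG / 8,
+    .ts         = (__u64) (uintptr_t) &ts,
+};
+
+/* wait for one completion, but for no longer than one second */
+syscall(__NR_io_uring_enter, ring_fd, 0, 1,
+        IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
+        &arg, sizeof(arg));
+.EE
+.in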
+.TP
+.B IORING_FEAT_NATIVE_WORKERS
+If this flag is set, io_uring is using native workers for its async helpers.
+Previous kernels used kernel threads that assumed the identity of the
+original io_uring owning task, but later kernels will actively create what
+looks more like regular process threads instead. Available since kernel
+5.12.
+.TP
+.B IORING_FEAT_RSRC_TAGS
+If this flag is set, then io_uring supports a variety of features related
+to fixed files and buffers. In particular, it indicates that registered
+buffers can be updated in-place, whereas before the full set would have to
+be unregistered first. Available since kernel 5.13.
+.TP
+.B IORING_FEAT_CQE_SKIP
+If this flag is set, then io_uring supports setting
+.B IOSQE_CQE_SKIP_SUCCESS
+in the submitted SQE, indicating that no CQE should be generated for this
+SQE if it executes normally. If an error happens processing the SQE, a
+CQE with the appropriate error value will still be generated. Available since
+kernel 5.17.
+.TP
+.B IORING_FEAT_LINKED_FILE
+If this flag is set, then io_uring supports sane assignment of files for SQEs
+that have dependencies. For example, if a chain of SQEs are submitted with
+.B IOSQE_IO_LINK,
+then kernels without this flag will prepare the file for each link upfront.
+If a previous link opens a file with a known index, e.g. if direct descriptors
+are used with open or accept, then file assignment needs to happen post
+execution of that SQE. If this flag is set, then the kernel will defer
+file assignment until execution of a given request is started. Available since
+kernel 5.17.
.PP
The rest of the fields in the
@@ -425,7 +548,9 @@ or
.BR io_uring_enter (2)
system calls.
-On error, -1 is returned and
+On error,
+.B -1
+is returned and
.I errno
is set appropriately.
.PP
diff --git a/man/io_uring_sq_ready.3 b/man/io_uring_sq_ready.3
new file mode 100644
index 0000000..9927388
--- /dev/null
+++ b/man/io_uring_sq_ready.3
@@ -0,0 +1,31 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_sq_ready 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_sq_ready \- number of unconsumed or unsubmitted entries in the SQ ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "unsigned io_uring_sq_ready(const struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_sq_ready (3)
+function returns the number of unconsumed (if SQPOLL) or unsubmitted entries
+that exist in the SQ ring belonging to the
+.I ring
+param.
+
+Usage of this function only applies if the ring has been set up with
+.B IORING_SETUP_SQPOLL,
+where request submission, and hence consumption from the SQ ring, happens
+through a polling thread.
+
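+As an illustrative sketch only (assuming an initialized SQPOLL
+.I ring
+and ignoring errors), the count can be used to wait for the poller thread
+to consume a queued entry:
+.PP
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+io_uring_prep_nop(sqe);
+io_uring_submit(&ring);
+
+/* busy-wait until the SQPOLL thread has consumed the entry */
+while (io_uring_sq_ready(&ring) != 0)
+    ;
+.EE
+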
+.SH RETURN VALUE
+Returns the number of unconsumed or unsubmitted entries in the SQ ring.
+.SH SEE ALSO
+.BR io_uring_cq_ready (3)
diff --git a/man/io_uring_sq_space_left.3 b/man/io_uring_sq_space_left.3
new file mode 100644
index 0000000..b5b2e21
--- /dev/null
+++ b/man/io_uring_sq_space_left.3
@@ -0,0 +1,25 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_sq_space_left 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_sq_space_left \- free space in the SQ ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "unsigned io_uring_sq_space_left(const struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_sq_space_left (3)
+function returns how much space is left in the SQ ring belonging to the
+.I ring
+param.
+
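+As an illustrative sketch only (assuming an initialized
+.I ring
+and ignoring errors), the helper can bound how many SQEs get queued up
+before a submit:
+.PP
+.EX
+/* queue NOPs only while the SQ ring has room left */
+while (io_uring_sq_space_left(&ring) > 0) {
+    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+    io_uring_prep_nop(sqe);
+}
+io_uring_submit(&ring);
+.EE
+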
+.SH RETURN VALUE
+Returns the number of available entries in the SQ ring.
+.SH SEE ALSO
+.BR io_uring_sq_ready (3)
diff --git a/man/io_uring_sqe_set_data.3 b/man/io_uring_sqe_set_data.3
new file mode 100644
index 0000000..274a892
--- /dev/null
+++ b/man/io_uring_sqe_set_data.3
@@ -0,0 +1,48 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_sqe_set_data 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_sqe_set_data \- set user data for submission queue event
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_sqe_set_data(struct io_uring_sqe *" sqe ","
+.BI " void *" user_data ");"
+.BI "
+.BI "void io_uring_sqe_set_data64(struct io_uring_sqe *" sqe ","
+.BI " __u64 " data ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_sqe_set_data (3)
+function stores a
+.I user_data
+pointer with the submission queue entry
+.IR sqe .
+
+The
+.BR io_uring_sqe_set_data64 (3)
+function stores a 64-bit
+.I data
+value with the submission queue entry
+.IR sqe .
+
+After the caller has requested a submission queue entry (SQE) with
+.BR io_uring_get_sqe (3) ,
+they can associate a data pointer or value with the SQE. Once the completion
+arrives, the function
+.BR io_uring_cqe_get_data (3)
+or
+.BR io_uring_cqe_get_data64 (3)
+can be called to retrieve the data pointer or value associated with the
+submitted request.
+
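+As an illustrative sketch only (assuming an initialized
+.I ring
+and ignoring errors; the tag value is made up):
+.PP
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+struct io_uring_cqe *cqe;
+
+io_uring_prep_nop(sqe);
+io_uring_sqe_set_data64(sqe, 0x1234);
+io_uring_submit(&ring);
+
+io_uring_wait_cqe(&ring, &cqe);
+/* io_uring_cqe_get_data64(cqe) now yields 0x1234 again */
+io_uring_cqe_seen(&ring, cqe);
+.EE
+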
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_cqe_get_data (3)
diff --git a/man/io_uring_sqe_set_data64.3 b/man/io_uring_sqe_set_data64.3
new file mode 120000
index 0000000..8bbd692
--- /dev/null
+++ b/man/io_uring_sqe_set_data64.3
@@ -0,0 +1 @@
+io_uring_sqe_set_data.3 \ No newline at end of file
diff --git a/man/io_uring_sqe_set_flags.3 b/man/io_uring_sqe_set_flags.3
new file mode 100644
index 0000000..75e836b
--- /dev/null
+++ b/man/io_uring_sqe_set_flags.3
@@ -0,0 +1,86 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_sqe_set_flags 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_sqe_set_flags \- set flags for submission queue entry
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "void io_uring_sqe_set_flags(struct io_uring_sqe *" sqe ","
+.BI " unsigned " flags ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_sqe_set_flags (3)
+function allows the caller to change the behavior of the submission queue entry
+by specifying flags. It enables the
+.I flags
+belonging to the
+.I sqe
+submission queue entry param.
+
+.I flags
+is a bit mask of 0 or more of the following values ORed together:
+.TP
+.B IOSQE_FIXED_FILE
+The file descriptor in the SQE refers to the index of a previously registered
+file or direct file descriptor, not a normal file descriptor.
+.TP
+.B IOSQE_ASYNC
+Normal operation for io_uring is to try to issue an sqe as non-blocking first,
+and if that fails, execute it in an async manner. To support more efficient
+overlapped operation of requests that the application knows/assumes will
+always (or most of the time) block, the application can ask for an sqe to be
+issued async from the start. Note that this flag immediately causes the SQE
+to be offloaded to an async helper thread with no initial non-blocking attempt.
+This may be less efficient and should not be used indiscriminately.
+.TP
+.B IOSQE_IO_LINK
+When this flag is specified, the SQE forms a link with the next SQE in the
+submission ring. That next SQE will not be started before the previous request
+completes. This, in effect, forms a chain of SQEs, which can be arbitrarily
+long. The tail of the chain is denoted by the first SQE that does not have this
+flag set. Chains are not supported across submission boundaries. Even if the
+last SQE in a submission has this flag set, it will still terminate the current
+chain. This flag has no effect on previous SQE submissions, nor does it impact
+SQEs that are outside of the chain tail. This means that multiple chains can be
+executing in parallel, or chains and individual SQEs. Only members inside the
+chain are serialized. A chain of SQEs will be broken if any request in that
+chain ends in error.
+.TP
+.B IOSQE_IO_HARDLINK
+Like
+.B IOSQE_IO_LINK ,
+except the links aren't severed if an error or unexpected result occurs.
+.TP
+.B IOSQE_IO_DRAIN
+When this flag is specified, the SQE will not be started before previously
+submitted SQEs have completed, and new SQEs will not be started before this
+one completes.
+.TP
+.B IOSQE_CQE_SKIP_SUCCESS
+Request that no CQE be generated for this request, if it completes successfully.
+This can be useful in cases where the application doesn't need to know when
+a specific request completed, if it completed successfully.
+.TP
+.B IOSQE_BUFFER_SELECT
+If set, and if the request type supports it, select an IO buffer from the
+indicated buffer group. This can be used with requests that read or receive
+data from a file or socket, where buffer selection is deferred until the kernel
+is ready to transfer data, instead of when the IO is originally submitted. The
+application must also set the
+.I buf_group
+field in the SQE, indicating which previously registered buffer group to select
+a buffer from.
+
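+As an illustrative sketch only (assuming an initialized
+.IR ring ,
+an open file descriptor
+.IR fd ,
+and a prepared buffer
+.IR buf ;
+errors ignored), linking a write to a subsequent fsync might look like this:
+.PP
+.EX
+struct io_uring_sqe *sqe;
+
+/* first request: link it to the one queued after it */
+sqe = io_uring_get_sqe(&ring);
+io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
+io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);
+
+/* second request: only started once the write has completed */
+sqe = io_uring_get_sqe(&ring);
+io_uring_prep_fsync(sqe, fd, 0);
+
+io_uring_submit(&ring);
+.EE
+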
+.SH RETURN VALUE
+None
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_register (2),
+.BR io_uring_register_buffers (3),
+.BR io_uring_register_buf_ring (3)
diff --git a/man/io_uring_sqring_wait.3 b/man/io_uring_sqring_wait.3
new file mode 100644
index 0000000..d70cf40
--- /dev/null
+++ b/man/io_uring_sqring_wait.3
@@ -0,0 +1,34 @@
+.\" Copyright (C) 2022 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_sqring_wait 3 "January 25, 2022" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_sqring_wait \- wait for free space in the SQ ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_sqring_wait(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR io_uring_sqring_wait (3)
+allows the caller to wait for space to free up in the SQ ring belonging to the
+.I ring
+param, which happens when the kernel side thread
+has consumed one or more entries. If the SQ ring is currently non-full,
+no action is taken.
+
+This feature can only be used when the ring has been set up with
+.B IORING_SETUP_SQPOLL
+and hence is using an offloaded approach to request submissions.
+
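+As an illustrative sketch only (assuming an initialized SQPOLL
+.I ring
+and ignoring errors):
+.PP
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+if (!sqe) {
+    /* SQ ring is full: wait for the poller thread to make room */
+    io_uring_sqring_wait(&ring);
+    sqe = io_uring_get_sqe(&ring);
+}
+.EE
+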
+.SH RETURN VALUE
+On success it returns the free space. If the kernel does not support the
+feature, -EINVAL is returned.
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqe (3),
+.BR io_uring_wait_cqes (3)
diff --git a/man/io_uring_submit.3 b/man/io_uring_submit.3
new file mode 100644
index 0000000..f871b89
--- /dev/null
+++ b/man/io_uring_submit.3
@@ -0,0 +1,46 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_submit 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_submit \- submit requests to the submission queue
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_submit(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_submit (3)
+function submits the next events to the submission queue belonging to the
+.IR ring .
+
+After the caller retrieves a submission queue entry (SQE) with
+.BR io_uring_get_sqe (3)
+and prepares the SQE using one of the provided helpers, it can be submitted with
+.BR io_uring_submit (3) .
+
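+As an illustrative sketch only (assuming an initialized
+.IR ring ,
+an open file descriptor
+.IR fd ,
+and a buffer
+.IR buf ):
+.PP
+.EX
+struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+int ret;
+
+io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
+ret = io_uring_submit(&ring);
+if (ret < 0)
+    fprintf(stderr, "submit: %s\en", strerror(-ret));
+.EE
+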
+.SH RETURN VALUE
+On success
+.BR io_uring_submit (3)
+returns the number of submitted submission queue entries. On failure it returns
+.BR -errno .
+.SH NOTES
+For any request that passes in data in a struct, that data must remain
+valid until the request has been successfully submitted. It need not remain
+valid until completion. Once a request has been submitted, the in-kernel
+state is stable. Very early kernels (5.4 and earlier) required state to be
+stable until the completion occurred. Applications can test for this
+behavior by inspecting the
+.B IORING_FEAT_SUBMIT_STABLE
+flag passed back from
+.BR io_uring_queue_init_params (3).
+In general, the man pages for the individual prep helpers will have a note
+mentioning this fact as well, if required for the given command.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit_and_wait (3),
+.BR io_uring_submit_and_wait_timeout (3)
diff --git a/man/io_uring_submit_and_wait.3 b/man/io_uring_submit_and_wait.3
new file mode 100644
index 0000000..1c9eb62
--- /dev/null
+++ b/man/io_uring_submit_and_wait.3
@@ -0,0 +1,38 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_submit_and_wait 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_submit_and_wait \- submit requests to the submission queue and wait for completion
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_submit_and_wait(struct io_uring *" ring ","
+.BI " unsigned " wait_nr ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_submit_and_wait (3)
+function submits the next events to the submission queue belonging to the
+.I ring
+and waits for
+.I wait_nr
+completion events.
+
+After the caller retrieves a submission queue entry (SQE) with
+.BR io_uring_get_sqe (3)
+and prepares the SQE, it can be submitted with
+.BR io_uring_submit_and_wait (3) .
+
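+As an illustrative sketch only (assuming an initialized
+.I ring
+and ignoring errors):
+.PP
+.EX
+int i;
+
+for (i = 0; i < 8; i++) {
+    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
+
+    io_uring_prep_nop(sqe);
+}
+
+/* submit all eight requests and wait until all have completed */
+io_uring_submit_and_wait(&ring, 8);
+.EE
+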
+.SH RETURN VALUE
+On success
+.BR io_uring_submit_and_wait (3)
+returns the number of submitted submission queue entries. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_submit_and_wait_timeout (3)
diff --git a/man/io_uring_submit_and_wait_timeout.3 b/man/io_uring_submit_and_wait_timeout.3
new file mode 100644
index 0000000..80fe889
--- /dev/null
+++ b/man/io_uring_submit_and_wait_timeout.3
@@ -0,0 +1,54 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_submit_and_wait_timeout 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_submit_and_wait_timeout \- submit requests to the submission queue and
+wait for completions with a timeout
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_submit_and_wait_timeout(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ","
+.BI " unsigned " wait_nr ","
+.BI " struct __kernel_timespec *" ts ","
+.BI " sigset_t *" sigmask ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_submit_and_wait_timeout (3)
+function submits the next events to the submission queue belonging to the
+.I ring
+and waits for
+.I wait_nr
+completion events or until the timeout
+.I ts
+expires. The completion events are stored in the
+.I cqe_ptr
+array. The
+.I sigmask
+specifies the set of signals to block. The prevailing signal mask is restored
+before returning.
+
+After the caller retrieves a submission queue entry (SQE) with
+.BR io_uring_get_sqe (3)
+and prepares the SQE, it can be submitted with
+.BR io_uring_submit_and_wait_timeout (3) .
+
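+As an illustrative sketch only (assuming an initialized
+.I ring
+with prepared SQEs queued;
+.I handle_timeout
+is a made-up application hook):
+.PP
+.EX
+struct io_uring_cqe *cqe;
+struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
+int ret;
+
+ret = io_uring_submit_and_wait_timeout(&ring, &cqe, 1, &ts, NULL);
+if (ret == -ETIME)
+    handle_timeout();
+.EE
+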
+.SH RETURN VALUE
+On success
+.BR io_uring_submit_and_wait_timeout (3)
+returns the number of submitted submission queue entries. On failure it returns
+.BR -errno .
+The most common failure case is not receiving a completion within the specified
+timeout, in which case
+.B -ETIME
+is returned.
+.SH SEE ALSO
+.BR io_uring_get_sqe (3),
+.BR io_uring_submit (3),
+.BR io_uring_submit_and_wait (3),
+.BR io_uring_wait_cqe (3)
diff --git a/man/io_uring_unregister_buf_ring.3 b/man/io_uring_unregister_buf_ring.3
new file mode 100644
index 0000000..ee87e86
--- /dev/null
+++ b/man/io_uring_unregister_buf_ring.3
@@ -0,0 +1,30 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_unregister_buf_ring 3 "May 18, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_unregister_buf_ring \- unregister a previously registered buffer ring
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_unregister_buf_ring(struct io_uring *" ring ",
+.BI " int " bgid ");"
+.BI "
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_unregister_buf_ring (3)
+function unregisters a previously registered shared buffer ring indicated by
+.IR bgid .
+
+.SH RETURN VALUE
+On success
+.BR io_uring_unregister_buf_ring (3)
+returns 0. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_register_buf_ring (3),
+.BR io_uring_buf_ring_free (3)
diff --git a/man/io_uring_unregister_buffers.3 b/man/io_uring_unregister_buffers.3
new file mode 100644
index 0000000..f066679
--- /dev/null
+++ b/man/io_uring_unregister_buffers.3
@@ -0,0 +1,27 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_unregister_buffers 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_unregister_buffers \- unregister buffers for fixed buffer operations
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_unregister_buffers(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_unregister_buffers (3)
+function unregisters the fixed buffers previously registered to the
+.IR ring .
+
+.SH RETURN VALUE
+On success
+.BR io_uring_unregister_buffers (3)
+returns 0. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_register_buffers (3)
diff --git a/man/io_uring_unregister_eventfd.3 b/man/io_uring_unregister_eventfd.3
new file mode 120000
index 0000000..6659957
--- /dev/null
+++ b/man/io_uring_unregister_eventfd.3
@@ -0,0 +1 @@
+io_uring_register_eventfd.3 \ No newline at end of file
diff --git a/man/io_uring_unregister_files.3 b/man/io_uring_unregister_files.3
new file mode 100644
index 0000000..c468d08
--- /dev/null
+++ b/man/io_uring_unregister_files.3
@@ -0,0 +1,27 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_unregister_files 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_unregister_files \- unregister file descriptors
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_unregister_files(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_unregister_files (3)
+function unregisters the file descriptors previously registered to the
+.IR ring .
+
+.SH RETURN VALUE
+On success
+.BR io_uring_unregister_files (3)
+returns 0. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_register_files (3)
diff --git a/man/io_uring_unregister_iowq_aff.3 b/man/io_uring_unregister_iowq_aff.3
new file mode 120000
index 0000000..c29bd44
--- /dev/null
+++ b/man/io_uring_unregister_iowq_aff.3
@@ -0,0 +1 @@
+io_uring_register_iowq_aff.3 \ No newline at end of file
diff --git a/man/io_uring_unregister_ring_fd.3 b/man/io_uring_unregister_ring_fd.3
new file mode 100644
index 0000000..85aca14
--- /dev/null
+++ b/man/io_uring_unregister_ring_fd.3
@@ -0,0 +1,32 @@
+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_unregister_ring_fd 3 "March 11, 2022" "liburing-2.2" "liburing Manual"
+.SH NAME
+io_uring_unregister_ring_fd \- unregister a ring file descriptor
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_unregister_ring_fd(struct io_uring *" ring ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR io_uring_unregister_ring_fd (3)
+unregisters the file descriptor of the ring.
+
+Unregisters a ring descriptor previously registered with the task. This is
+done automatically when
+.BR io_uring_queue_exit (3)
+is called, but can also be done to free up space for new ring registrations.
+For more information on ring descriptor registration, see
+.BR io_uring_register_ring_fd (3).
+
+.SH RETURN VALUE
+Returns 1 on success, indicating that one file descriptor was unregistered, or
+.BR -errno
+on error.
+.SH SEE ALSO
+.BR io_uring_register_ring_fd (3),
+.BR io_uring_register_files (3)
diff --git a/man/io_uring_wait_cqe.3 b/man/io_uring_wait_cqe.3
new file mode 100644
index 0000000..c115f6f
--- /dev/null
+++ b/man/io_uring_wait_cqe.3
@@ -0,0 +1,40 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_wait_cqe 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_wait_cqe \- wait for one io_uring completion event
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_wait_cqe(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_wait_cqe (3)
+function waits for an IO completion from the queue belonging to the
+.I ring
+param, blocking if necessary. If an event is already available in
+the ring when invoked, no waiting will occur. The
+.I cqe_ptr
+param is filled in on success.
+
+After the caller has submitted a request with
+.BR io_uring_submit (3),
+the application can retrieve the completion with
+.BR io_uring_wait_cqe (3).
+
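+As an illustrative sketch only (assuming an initialized
+.I ring
+with a request already submitted):
+.PP
+.EX
+struct io_uring_cqe *cqe;
+int ret;
+
+ret = io_uring_wait_cqe(&ring, &cqe);
+if (ret == 0) {
+    if (cqe->res < 0)
+        fprintf(stderr, "request failed: %s\en", strerror(-cqe->res));
+    io_uring_cqe_seen(&ring, cqe);
+}
+.EE
+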
+.SH RETURN VALUE
+On success
+.BR io_uring_wait_cqe (3)
+returns 0 and the cqe_ptr param is filled in. On failure it returns
+.BR -errno .
+The return value indicates the result of waiting for a CQE, and it has no
+relation to the CQE result itself.
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqes (3)
diff --git a/man/io_uring_wait_cqe_nr.3 b/man/io_uring_wait_cqe_nr.3
new file mode 100644
index 0000000..5a4a5d5
--- /dev/null
+++ b/man/io_uring_wait_cqe_nr.3
@@ -0,0 +1,43 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_wait_cqe_nr 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_wait_cqe_nr \- wait for one or more io_uring completion events
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_wait_cqe_nr(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ","
+.BI " unsigned " wait_nr ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_wait_cqe_nr (3)
+function returns
+.I wait_nr
+IO completion events from the queue belonging to the
+.I ring
+param, waiting for them if necessary. If the requested number of events is
+already available in the ring when invoked, no waiting will occur. The
+.I cqe_ptr
+param is filled in on success.
+
+After the caller has submitted a request with
+.BR io_uring_submit (3),
+the application can retrieve the completion with
+.BR io_uring_wait_cqe_nr (3).
+
+.SH RETURN VALUE
+On success
+.BR io_uring_wait_cqe_nr (3)
+returns 0 and the cqe_ptr param is filled in. On failure it returns
+.BR -errno .
+The return value indicates the result of waiting for a CQE, and it has no
+relation to the CQE result itself.
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqes (3)
diff --git a/man/io_uring_wait_cqe_timeout.3 b/man/io_uring_wait_cqe_timeout.3
new file mode 100644
index 0000000..965fc32
--- /dev/null
+++ b/man/io_uring_wait_cqe_timeout.3
@@ -0,0 +1,53 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_wait_cqe_timeout 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_wait_cqe_timeout \- wait for one io_uring completion event with timeout
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_wait_cqe_timeout(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ","
+.BI " struct __kernel_timespec *" ts ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_wait_cqe_timeout (3)
+function waits for one IO completion to be available from the queue belonging
+to the
+.I ring
+param, blocking if necessary or until the timeout
+.I ts
+expires. If an event is already available in the ring when invoked, no waiting
+will occur.
+
+The
+.I cqe_ptr
+param is filled in on success.
+
+If
+.I ts
+is specified and an older kernel without
+.B IORING_FEAT_EXT_ARG
+is used, the application does not need to call
+.BR io_uring_submit (3)
+before calling
+.BR io_uring_wait_cqe_timeout (3).
+For newer kernels with that feature flag set, there is no implied submit
+when waiting for a request.
+
+.SH RETURN VALUE
+On success
+.BR io_uring_wait_cqe_timeout (3)
+returns 0 and the cqe_ptr param is filled in. On failure it returns
+.BR -errno .
+The return value indicates the result of waiting for a CQE, and it has no
+relation to the CQE result itself.
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqes (3),
+.BR io_uring_wait_cqe (3)
diff --git a/man/io_uring_wait_cqes.3 b/man/io_uring_wait_cqes.3
new file mode 100644
index 0000000..b771ebe
--- /dev/null
+++ b/man/io_uring_wait_cqes.3
@@ -0,0 +1,56 @@
+.\" Copyright (C) 2021 Stefan Roesch <shr@fb.com>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_wait_cqes 3 "November 15, 2021" "liburing-2.1" "liburing Manual"
+.SH NAME
+io_uring_wait_cqes \- wait for one or more io_uring completion events
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_wait_cqes(struct io_uring *" ring ","
+.BI " struct io_uring_cqe **" cqe_ptr ","
+.BI " unsigned " wait_nr ","
+.BI " struct __kernel_timespec *" ts ","
+.BI " sigset_t *" sigmask ");
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_wait_cqes (3)
+function returns
+.I wait_nr
+IO completions from the queue belonging to the
+.I ring
+param, waiting for them if necessary or until the timeout
+.I ts
+expires. The
+.I sigmask
+specifies the set of signals to block. The prevailing signal mask is restored
+before returning.
+
+The
+.I cqe_ptr
+param is filled in on success.
+
+If
+.I ts
+is specified and an older kernel without
+.B IORING_FEAT_EXT_ARG
+is used, the application does not need to call
+.BR io_uring_submit (3)
+before calling
+.BR io_uring_wait_cqes (3).
+For newer kernels with that feature flag set, there is no implied submit
+when waiting for a request.
+
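+As an illustrative sketch only (assuming an initialized
+.I ring
+with at least four requests submitted; errors ignored), a batched reap
+might look like this:
+.PP
+.EX
+struct io_uring_cqe *cqe;
+unsigned head, seen = 0;
+
+io_uring_wait_cqes(&ring, &cqe, 4, NULL, NULL);
+io_uring_for_each_cqe(&ring, head, cqe) {
+    /* process cqe->res here */
+    seen++;
+}
+io_uring_cq_advance(&ring, seen);
+.EE
+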
+.SH RETURN VALUE
+On success
+.BR io_uring_wait_cqes (3)
+returns 0 and the cqe_ptr param is filled in. On failure it returns
+.BR -errno .
+.SH SEE ALSO
+.BR io_uring_submit (3),
+.BR io_uring_wait_cqe_timeout (3),
+.BR io_uring_wait_cqe (3)
diff --git a/src/Makefile b/src/Makefile
index dfca826..12cf49f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,22 +1,23 @@
+include ../Makefile.common
+
prefix ?= /usr
includedir ?= $(prefix)/include
libdir ?= $(prefix)/lib
libdevdir ?= $(prefix)/lib
CPPFLAGS ?=
-override CPPFLAGS += -Iinclude/ -include ../config-host.h
-CFLAGS ?= -g -fomit-frame-pointer -O2
-override CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare
+override CPPFLAGS += -D_GNU_SOURCE \
+ -Iinclude/ -include ../config-host.h
+CFLAGS ?= -g -O2 -Wall -Wextra -fno-stack-protector
+override CFLAGS += -Wno-unused-parameter -Wno-sign-compare -DLIBURING_INTERNAL
SO_CFLAGS=-fPIC $(CFLAGS)
L_CFLAGS=$(CFLAGS)
LINK_FLAGS=
LINK_FLAGS+=$(LDFLAGS)
ENABLE_SHARED ?= 1
-soname=liburing.so.2
-minor=0
-micro=0
-libname=$(soname).$(minor).$(micro)
+soname=liburing.so.$(VERSION_MAJOR)
+libname=liburing.so.$(VERSION)
all_targets += liburing.a
ifeq ($(ENABLE_SHARED),1)
@@ -31,19 +32,31 @@ endif
all: $(all_targets)
-liburing_srcs := setup.c queue.c syscall.c register.c
+liburing_srcs := setup.c queue.c register.c
+
+ifeq ($(CONFIG_NOLIBC),y)
+ liburing_srcs += nolibc.c
+ override CFLAGS += -nostdlib -nodefaultlibs -ffreestanding
+ override CPPFLAGS += -nostdlib -nodefaultlibs -ffreestanding
+ override LINK_FLAGS += -nostdlib -nodefaultlibs
+else
+ liburing_srcs += syscall.c
+endif
+override CPPFLAGS += -MT "$@" -MMD -MP -MF "$@.d"
liburing_objs := $(patsubst %.c,%.ol,$(liburing_srcs))
liburing_sobjs := $(patsubst %.c,%.os,$(liburing_srcs))
-$(liburing_objs) $(liburing_sobjs): include/liburing/io_uring.h
-
%.os: %.c
$(QUIET_CC)$(CC) $(CPPFLAGS) $(SO_CFLAGS) -c -o $@ $<
%.ol: %.c
$(QUIET_CC)$(CC) $(CPPFLAGS) $(L_CFLAGS) -c -o $@ $<
+# Include compiler generated dependency files.
+-include $(liburing_objs:%=%.d)
+-include $(liburing_sobjs:%=%.d)
+
AR ?= ar
RANLIB ?= ranlib
liburing.a: $(liburing_objs)
@@ -66,9 +79,11 @@ ifeq ($(ENABLE_SHARED),1)
ln -sf $(relativelibdir)$(libname) $(libdevdir)/liburing.so
endif
-$(liburing_objs): include/liburing.h
-
clean:
@rm -f $(all_targets) $(liburing_objs) $(liburing_sobjs) $(soname).new
- @rm -f *.so* *.a *.o
+ @rm -f *.so* *.a *.o *.d
@rm -f include/liburing/compat.h
+
+ @# When cleaning, we don't include ../config-host.mak,
+ @# so the nolibc objects are always skipped, clean them up!
+ @rm -f nolibc.ol nolibc.os
diff --git a/src/arch/aarch64/syscall.h b/src/arch/aarch64/syscall.h
new file mode 100644
index 0000000..c0ab7e2
--- /dev/null
+++ b/src/arch/aarch64/syscall.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __INTERNAL__LIBURING_SYSCALL_H
+ #error "This file should be included from src/syscall.h (liburing)"
+#endif
+
+#ifndef LIBURING_ARCH_AARCH64_SYSCALL_H
+#define LIBURING_ARCH_AARCH64_SYSCALL_H
+
+#if defined(__aarch64__)
+
+#define __do_syscallN(...) ({ \
+ __asm__ volatile ( \
+ "svc 0" \
+ : "=r"(x0) \
+ : __VA_ARGS__ \
+ : "memory", "cc"); \
+ (long) x0; \
+})
+
+#define __do_syscall0(__n) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register long x0 __asm__("x0"); \
+ \
+ __do_syscallN("r" (x8)); \
+})
+
+#define __do_syscall1(__n, __a) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ \
+ __do_syscallN("r" (x8), "0" (x0)); \
+})
+
+#define __do_syscall2(__n, __a, __b) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1)); \
+})
+
+#define __do_syscall3(__n, __a, __b, __c) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2)); \
+})
+
+#define __do_syscall4(__n, __a, __b, __c, __d) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3));\
+})
+
+#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ register __typeof__(__e) x4 __asm__("x4") = __e; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \
+ "r"(x4)); \
+})
+
+#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ register __typeof__(__e) x4 __asm__("x4") = __e; \
+ register __typeof__(__f) x5 __asm__("x5") = __f; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \
+ "r" (x4), "r"(x5)); \
+})
+
+#include "../syscall-defs.h"
+
+#else /* #if defined(__aarch64__) */
+
+#include "../generic/syscall.h"
+
+#endif /* #if defined(__aarch64__) */
+
+#endif /* #ifndef LIBURING_ARCH_AARCH64_SYSCALL_H */
diff --git a/src/arch/generic/lib.h b/src/arch/generic/lib.h
new file mode 100644
index 0000000..737e795
--- /dev/null
+++ b/src/arch/generic/lib.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __INTERNAL__LIBURING_LIB_H
+ #error "This file should be included from src/lib.h (liburing)"
+#endif
+
+#ifndef LIBURING_ARCH_GENERIC_LIB_H
+#define LIBURING_ARCH_GENERIC_LIB_H
+
+static inline long get_page_size(void)
+{
+ long page_size;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ if (page_size < 0)
+ page_size = 4096;
+
+ return page_size;
+}
+
+#endif /* #ifndef LIBURING_ARCH_GENERIC_LIB_H */
diff --git a/src/arch/generic/syscall.h b/src/arch/generic/syscall.h
new file mode 100644
index 0000000..fa93064
--- /dev/null
+++ b/src/arch/generic/syscall.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __INTERNAL__LIBURING_SYSCALL_H
+ #error "This file should be included from src/syscall.h (liburing)"
+#endif
+
+#ifndef LIBURING_ARCH_GENERIC_SYSCALL_H
+#define LIBURING_ARCH_GENERIC_SYSCALL_H
+
+static inline int ____sys_io_uring_register(int fd, unsigned opcode,
+ const void *arg, unsigned nr_args)
+{
+ int ret;
+ ret = syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int ____sys_io_uring_setup(unsigned entries,
+ struct io_uring_params *p)
+{
+ int ret;
+ ret = syscall(__NR_io_uring_setup, entries, p);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int ____sys_io_uring_enter2(int fd, unsigned to_submit,
+ unsigned min_complete, unsigned flags,
+ sigset_t *sig, int sz)
+{
+ int ret;
+ ret = syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags,
+ sig, sz);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int ____sys_io_uring_enter(int fd, unsigned to_submit,
+ unsigned min_complete, unsigned flags,
+ sigset_t *sig)
+{
+ return ____sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig,
+ _NSIG / 8);
+}
+
+static inline void *__sys_mmap(void *addr, size_t length, int prot, int flags,
+ int fd, off_t offset)
+{
+ void *ret;
+ ret = mmap(addr, length, prot, flags, fd, offset);
+ return (ret == MAP_FAILED) ? ERR_PTR(-errno) : ret;
+}
+
+static inline int __sys_munmap(void *addr, size_t length)
+{
+ int ret;
+ ret = munmap(addr, length);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int __sys_madvise(void *addr, size_t length, int advice)
+{
+ int ret;
+ ret = madvise(addr, length, advice);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int __sys_getrlimit(int resource, struct rlimit *rlim)
+{
+ int ret;
+ ret = getrlimit(resource, rlim);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int __sys_setrlimit(int resource, const struct rlimit *rlim)
+{
+ int ret;
+ ret = setrlimit(resource, rlim);
+ return (ret < 0) ? -errno : ret;
+}
+
+static inline int __sys_close(int fd)
+{
+ int ret;
+ ret = close(fd);
+ return (ret < 0) ? -errno : ret;
+}
+
+#endif /* #ifndef LIBURING_ARCH_GENERIC_SYSCALL_H */
diff --git a/src/arch/syscall-defs.h b/src/arch/syscall-defs.h
new file mode 100644
index 0000000..1e8ae1b
--- /dev/null
+++ b/src/arch/syscall-defs.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef LIBURING_ARCH_SYSCALL_DEFS_H
+#define LIBURING_ARCH_SYSCALL_DEFS_H
+
+static inline void *__sys_mmap(void *addr, size_t length, int prot, int flags,
+ int fd, off_t offset)
+{
+ int nr;
+
+#if defined(__i386__)
+ nr = __NR_mmap2;
+ offset >>= 12;
+#else
+ nr = __NR_mmap;
+#endif
+ return (void *) __do_syscall6(nr, addr, length, prot, flags, fd, offset);
+}
+
+static inline int __sys_munmap(void *addr, size_t length)
+{
+ return (int) __do_syscall2(__NR_munmap, addr, length);
+}
+
+static inline int __sys_madvise(void *addr, size_t length, int advice)
+{
+ return (int) __do_syscall3(__NR_madvise, addr, length, advice);
+}
+
+static inline int __sys_getrlimit(int resource, struct rlimit *rlim)
+{
+ return (int) __do_syscall2(__NR_getrlimit, resource, rlim);
+}
+
+static inline int __sys_setrlimit(int resource, const struct rlimit *rlim)
+{
+ return (int) __do_syscall2(__NR_setrlimit, resource, rlim);
+}
+
+static inline int __sys_close(int fd)
+{
+ return (int) __do_syscall1(__NR_close, fd);
+}
+
+static inline int ____sys_io_uring_register(int fd, unsigned opcode,
+ const void *arg, unsigned nr_args)
+{
+ return (int) __do_syscall4(__NR_io_uring_register, fd, opcode, arg,
+ nr_args);
+}
+
+static inline int ____sys_io_uring_setup(unsigned entries,
+ struct io_uring_params *p)
+{
+ return (int) __do_syscall2(__NR_io_uring_setup, entries, p);
+}
+
+static inline int ____sys_io_uring_enter2(int fd, unsigned to_submit,
+ unsigned min_complete, unsigned flags,
+ sigset_t *sig, int sz)
+{
+ return (int) __do_syscall6(__NR_io_uring_enter, fd, to_submit,
+ min_complete, flags, sig, sz);
+}
+
+static inline int ____sys_io_uring_enter(int fd, unsigned to_submit,
+ unsigned min_complete, unsigned flags,
+ sigset_t *sig)
+{
+ return ____sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig,
+ _NSIG / 8);
+}
+
+#endif
diff --git a/src/arch/x86/lib.h b/src/arch/x86/lib.h
new file mode 100644
index 0000000..e6a74f3
--- /dev/null
+++ b/src/arch/x86/lib.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __INTERNAL__LIBURING_LIB_H
+ #error "This file should be included from src/lib.h (liburing)"
+#endif
+
+#ifndef LIBURING_ARCH_X86_LIB_H
+#define LIBURING_ARCH_X86_LIB_H
+
+static inline long get_page_size(void)
+{
+ return 4096;
+}
+
+#endif /* #ifndef LIBURING_ARCH_X86_LIB_H */
diff --git a/src/arch/x86/syscall.h b/src/arch/x86/syscall.h
new file mode 100644
index 0000000..43c576b
--- /dev/null
+++ b/src/arch/x86/syscall.h
@@ -0,0 +1,300 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __INTERNAL__LIBURING_SYSCALL_H
+ #error "This file should be included from src/syscall.h (liburing)"
+#endif
+
+#ifndef LIBURING_ARCH_X86_SYSCALL_H
+#define LIBURING_ARCH_X86_SYSCALL_H
+
+#if defined(__x86_64__)
+/**
+ * Note for syscall registers usage (x86-64):
+ * - %rax is the syscall number.
+ * - %rax is also the return value.
+ * - %rdi is the 1st argument.
+ * - %rsi is the 2nd argument.
+ * - %rdx is the 3rd argument.
+ * - %r10 is the 4th argument (**yes it's %r10, not %rcx!**).
+ * - %r8 is the 5th argument.
+ * - %r9 is the 6th argument.
+ *
+ * `syscall` instruction will clobber %r11 and %rcx.
+ *
+ * After the syscall returns to userspace:
+ * - %r11 will contain %rflags.
+ * - %rcx will contain the return address.
+ *
+ * IOW, after the syscall returns to userspace:
+ * %r11 == %rflags and %rcx == %rip.
+ */
+
+#define __do_syscall0(NUM) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"(NUM) /* %rax */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall1(NUM, ARG1) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)) /* %rdi */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall2(NUM, ARG1, ARG2) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)) /* %rsi */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)) /* %rdx */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10) /* %r10 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10), /* %r10 */ \
+ "r"(__r8) /* %r8 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \
+ register __typeof__(ARG6) __r9 __asm__("r9") = (ARG6); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10), /* %r10 */ \
+ "r"(__r8), /* %r8 */ \
+ "r"(__r9) /* %r9 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#include "../syscall-defs.h"
+
+#else /* #if defined(__x86_64__) */
+
+#ifdef CONFIG_NOLIBC
+/**
+ * Note for syscall registers usage (x86, 32-bit):
+ * - %eax is the syscall number.
+ * - %eax is also the return value.
+ * - %ebx is the 1st argument.
+ * - %ecx is the 2nd argument.
+ * - %edx is the 3rd argument.
+ * - %esi is the 4th argument.
+ * - %edi is the 5th argument.
+ * - %ebp is the 6th argument.
+ */
+
+#define __do_syscall0(NUM) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a"(eax) /* %eax */ \
+ : "a"(NUM) /* %eax */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+#define __do_syscall1(NUM, ARG1) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a"(eax) /* %eax */ \
+ : "a"(NUM), /* %eax */ \
+ "b"((ARG1)) /* %ebx */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+#define __do_syscall2(NUM, ARG1, ARG2) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a" (eax) /* %eax */ \
+ : "a"(NUM), /* %eax */ \
+ "b"((ARG1)), /* %ebx */ \
+ "c"((ARG2)) /* %ecx */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a" (eax) /* %eax */ \
+ : "a"(NUM), /* %eax */ \
+ "b"((ARG1)), /* %ebx */ \
+ "c"((ARG2)), /* %ecx */ \
+ "d"((ARG3)) /* %edx */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a" (eax) /* %eax */ \
+ : "a"(NUM), /* %eax */ \
+ "b"((ARG1)), /* %ebx */ \
+ "c"((ARG2)), /* %ecx */ \
+ "d"((ARG3)), /* %edx */ \
+ "S"((ARG4)) /* %esi */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \
+ intptr_t eax; \
+ \
+ __asm__ volatile( \
+ "int $0x80" \
+ : "=a" (eax) /* %eax */ \
+ : "a"(NUM), /* %eax */ \
+ "b"((ARG1)), /* %ebx */ \
+ "c"((ARG2)), /* %ecx */ \
+ "d"((ARG3)), /* %edx */ \
+ "S"((ARG4)), /* %esi */ \
+ "D"((ARG5)) /* %edi */ \
+ : "memory" \
+ ); \
+ eax; \
+})
+
+
+/*
+ * On i386, the 6th argument of syscall goes in %ebp. However, both Clang
+ * and GCC cannot use %ebp in the clobber list and in the "r" constraint
+ * without using -fomit-frame-pointer. To make it always available for
+ * any kind of compilation, the below workaround is implemented:
+ *
+ * 1) Push the 6th argument.
+ * 2) Push %ebp.
+ * 3) Load the 6th argument from 4(%esp) to %ebp.
+ * 4) Do the syscall (int $0x80).
+ * 5) Pop %ebp (restore the old value of %ebp).
+ * 6) Add 4 to %esp (drop the pushed 6th argument).
+ *
+ * WARNING:
+ * Don't use register variables for __do_syscall6(), there is a known
+ * GCC bug that results in an endless loop.
+ *
+ * BugLink: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105032
+ *
+ */
+#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \
+ intptr_t eax = (intptr_t)(NUM); \
+ intptr_t arg6 = (intptr_t)(ARG6); /* Always in memory */ \
+ __asm__ volatile ( \
+ "pushl %[_arg6]\n\t" \
+ "pushl %%ebp\n\t" \
+ "movl 4(%%esp),%%ebp\n\t" \
+ "int $0x80\n\t" \
+ "popl %%ebp\n\t" \
+ "addl $4,%%esp" \
+ : "+a"(eax) /* %eax */ \
+ : "b"(ARG1), /* %ebx */ \
+ "c"(ARG2), /* %ecx */ \
+ "d"(ARG3), /* %edx */ \
+ "S"(ARG4), /* %esi */ \
+ "D"(ARG5), /* %edi */ \
+ [_arg6]"m"(arg6) /* memory */ \
+ : "memory", "cc" \
+ ); \
+ eax; \
+})
+
+#include "../syscall-defs.h"
+
+#else /* #ifdef CONFIG_NOLIBC */
+
+#include "../generic/syscall.h"
+
+#endif /* #ifdef CONFIG_NOLIBC */
+
+#endif /* #if defined(__x86_64__) */
+
+#endif /* #ifndef LIBURING_ARCH_X86_SYSCALL_H */
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 51dc602..1c1b03e 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -6,25 +6,31 @@
#define _XOPEN_SOURCE 500 /* Required for glibc to expose sigset_t */
#endif
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE /* Required for musl to expose cpu_set_t */
+#endif
+
#include <sys/socket.h>
-#include <sys/uio.h>
#include <sys/stat.h>
+#include <sys/uio.h>
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>
+#include <fcntl.h>
+#include <sched.h>
#include <linux/swab.h>
#include "liburing/compat.h"
#include "liburing/io_uring.h"
#include "liburing/barrier.h"
#ifndef uring_unlikely
-# define uring_unlikely(cond) __builtin_expect(!!(cond), 0)
+#define uring_unlikely(cond) __builtin_expect(!!(cond), 0)
#endif
#ifndef uring_likely
-# define uring_likely(cond) __builtin_expect(!!(cond), 1)
+#define uring_likely(cond) __builtin_expect(!!(cond), 1)
#endif
#ifdef __cplusplus
@@ -75,7 +81,10 @@ struct io_uring {
int ring_fd;
unsigned features;
- unsigned pad[3];
+ int enter_ring_fd;
+ __u8 int_flags;
+ __u8 pad[3];
+ unsigned pad2;
};
/*
@@ -86,74 +95,115 @@ struct io_uring {
* return an allocated io_uring_probe structure, or NULL if probe fails (for
* example, if it is not available). The caller is responsible for freeing it
*/
-extern struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring);
+struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring);
/* same as io_uring_get_probe_ring, but takes care of ring init and teardown */
-extern struct io_uring_probe *io_uring_get_probe(void);
+struct io_uring_probe *io_uring_get_probe(void);
/*
* frees a probe allocated through io_uring_get_probe() or
* io_uring_get_probe_ring()
*/
-extern void io_uring_free_probe(struct io_uring_probe *probe);
+void io_uring_free_probe(struct io_uring_probe *probe);
-static inline int io_uring_opcode_supported(const struct io_uring_probe *p, int op)
+static inline int io_uring_opcode_supported(const struct io_uring_probe *p,
+ int op)
{
if (op > p->last_op)
return 0;
return (p->ops[op].flags & IO_URING_OP_SUPPORTED) != 0;
}
-extern int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
- struct io_uring_params *p);
-extern int io_uring_queue_init(unsigned entries, struct io_uring *ring,
- unsigned flags);
-extern int io_uring_queue_mmap(int fd, struct io_uring_params *p,
- struct io_uring *ring);
-extern int io_uring_ring_dontfork(struct io_uring *ring);
-extern void io_uring_queue_exit(struct io_uring *ring);
+int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p);
+int io_uring_queue_init(unsigned entries, struct io_uring *ring,
+ unsigned flags);
+int io_uring_queue_mmap(int fd, struct io_uring_params *p,
+ struct io_uring *ring);
+int io_uring_ring_dontfork(struct io_uring *ring);
+void io_uring_queue_exit(struct io_uring *ring);
unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
struct io_uring_cqe **cqes, unsigned count);
-extern int io_uring_wait_cqes(struct io_uring *ring,
- struct io_uring_cqe **cqe_ptr, unsigned wait_nr,
- struct __kernel_timespec *ts, sigset_t *sigmask);
-extern int io_uring_wait_cqe_timeout(struct io_uring *ring,
- struct io_uring_cqe **cqe_ptr, struct __kernel_timespec *ts);
-extern int io_uring_submit(struct io_uring *ring);
-extern int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr);
-extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
-
-extern int io_uring_register_buffers(struct io_uring *ring,
- const struct iovec *iovecs,
- unsigned nr_iovecs);
-extern int io_uring_unregister_buffers(struct io_uring *ring);
-extern int io_uring_register_files(struct io_uring *ring, const int *files,
- unsigned nr_files);
-extern int io_uring_unregister_files(struct io_uring *ring);
-extern int io_uring_register_files_update(struct io_uring *ring, unsigned off,
- int *files, unsigned nr_files);
-extern int io_uring_register_eventfd(struct io_uring *ring, int fd);
-extern int io_uring_register_eventfd_async(struct io_uring *ring, int fd);
-extern int io_uring_unregister_eventfd(struct io_uring *ring);
-extern int io_uring_register_probe(struct io_uring *ring,
- struct io_uring_probe *p, unsigned nr);
-extern int io_uring_register_personality(struct io_uring *ring);
-extern int io_uring_unregister_personality(struct io_uring *ring, int id);
-extern int io_uring_register_restrictions(struct io_uring *ring,
- struct io_uring_restriction *res,
- unsigned int nr_res);
-extern int io_uring_enable_rings(struct io_uring *ring);
-extern int __io_uring_sqring_wait(struct io_uring *ring);
+int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+ unsigned wait_nr, struct __kernel_timespec *ts,
+ sigset_t *sigmask);
+int io_uring_wait_cqe_timeout(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ struct __kernel_timespec *ts);
+int io_uring_submit(struct io_uring *ring);
+int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr);
+int io_uring_submit_and_wait_timeout(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ unsigned wait_nr,
+ struct __kernel_timespec *ts,
+ sigset_t *sigmask);
+
+int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs,
+ unsigned nr_iovecs);
+int io_uring_register_buffers_tags(struct io_uring *ring,
+ const struct iovec *iovecs,
+ const __u64 *tags, unsigned nr);
+int io_uring_register_buffers_sparse(struct io_uring *ring, unsigned nr);
+int io_uring_register_buffers_update_tag(struct io_uring *ring,
+ unsigned off,
+ const struct iovec *iovecs,
+ const __u64 *tags, unsigned nr);
+int io_uring_unregister_buffers(struct io_uring *ring);
+
+int io_uring_register_files(struct io_uring *ring, const int *files,
+ unsigned nr_files);
+int io_uring_register_files_tags(struct io_uring *ring, const int *files,
+ const __u64 *tags, unsigned nr);
+int io_uring_register_files_sparse(struct io_uring *ring, unsigned nr);
+int io_uring_register_files_update_tag(struct io_uring *ring, unsigned off,
+ const int *files, const __u64 *tags,
+ unsigned nr_files);
+
+int io_uring_unregister_files(struct io_uring *ring);
+int io_uring_register_files_update(struct io_uring *ring, unsigned off,
+ int *files, unsigned nr_files);
+int io_uring_register_eventfd(struct io_uring *ring, int fd);
+int io_uring_register_eventfd_async(struct io_uring *ring, int fd);
+int io_uring_unregister_eventfd(struct io_uring *ring);
+int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p,
+ unsigned nr);
+int io_uring_register_personality(struct io_uring *ring);
+int io_uring_unregister_personality(struct io_uring *ring, int id);
+int io_uring_register_restrictions(struct io_uring *ring,
+ struct io_uring_restriction *res,
+ unsigned int nr_res);
+int io_uring_enable_rings(struct io_uring *ring);
+int __io_uring_sqring_wait(struct io_uring *ring);
+int io_uring_register_iowq_aff(struct io_uring *ring, size_t cpusz,
+ const cpu_set_t *mask);
+int io_uring_unregister_iowq_aff(struct io_uring *ring);
+int io_uring_register_iowq_max_workers(struct io_uring *ring,
+ unsigned int *values);
+int io_uring_register_ring_fd(struct io_uring *ring);
+int io_uring_unregister_ring_fd(struct io_uring *ring);
+int io_uring_register_buf_ring(struct io_uring *ring,
+ struct io_uring_buf_reg *reg, unsigned int flags);
+int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid);
/*
* Helper for the peek/wait single cqe functions. Exported because of that,
* but probably shouldn't be used directly in an application.
*/
-extern int __io_uring_get_cqe(struct io_uring *ring,
- struct io_uring_cqe **cqe_ptr, unsigned submit,
- unsigned wait_nr, sigset_t *sigmask);
+int __io_uring_get_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr, unsigned submit,
+ unsigned wait_nr, sigset_t *sigmask);
#define LIBURING_UDATA_TIMEOUT ((__u64) -1)
+/*
+ * Calculates the shift needed when stepping through CQEs.
+ * For standard CQEs the step size is 1, for big CQEs it is 2.
+ */
+#define io_uring_cqe_shift(ring) \
+ (!!((ring)->flags & IORING_SETUP_CQE32))
+
+#define io_uring_cqe_index(ring,ptr,mask) \
+ (((ptr) & (mask)) << io_uring_cqe_shift(ring))
+
#define io_uring_for_each_cqe(ring, head, cqe) \
/* \
* io_uring_smp_load_acquire() enforces the order of tail \
@@ -161,7 +211,7 @@ extern int __io_uring_get_cqe(struct io_uring *ring,
*/ \
for (head = *(ring)->cq.khead; \
(cqe = (head != io_uring_smp_load_acquire((ring)->cq.ktail) ? \
- &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL)); \
+ &(ring)->cq.cqes[io_uring_cqe_index(ring, head, *(ring)->cq.kring_mask)] : NULL)); \
head++) \
/*
@@ -195,6 +245,11 @@ static inline void io_uring_cqe_seen(struct io_uring *ring,
/*
* Command prep helpers
*/
+
+/*
+ * Associate pointer @data with the sqe, for later retrieval from the cqe
+ * at command completion time with io_uring_cqe_get_data().
+ */
static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data)
{
sqe->user_data = (unsigned long) data;
@@ -205,17 +260,45 @@ static inline void *io_uring_cqe_get_data(const struct io_uring_cqe *cqe)
return (void *) (uintptr_t) cqe->user_data;
}
+/*
+ * Assign a 64-bit value to this sqe, which can get retrieved at completion
+ * time with io_uring_cqe_get_data64. Just like the non-64 variants, except
+ * these store a 64-bit type rather than a data pointer.
+ */
+static inline void io_uring_sqe_set_data64(struct io_uring_sqe *sqe,
+ __u64 data)
+{
+ sqe->user_data = data;
+}
+
+static inline __u64 io_uring_cqe_get_data64(const struct io_uring_cqe *cqe)
+{
+ return cqe->user_data;
+}
+
+/*
+ * Tell the app that we have the 64-bit variants of the get/set userdata
+ */
+#define LIBURING_HAVE_DATA64
+
static inline void io_uring_sqe_set_flags(struct io_uring_sqe *sqe,
unsigned flags)
{
- sqe->flags = flags;
+ sqe->flags = (__u8) flags;
+}
+
+static inline void __io_uring_set_target_fixed_file(struct io_uring_sqe *sqe,
+ unsigned int file_index)
+{
+ /* 0 means no fixed files, indexes should be encoded as "index + 1" */
+ sqe->file_index = file_index + 1;
}
static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
const void *addr, unsigned len,
__u64 offset)
{
- sqe->opcode = op;
+ sqe->opcode = (__u8) op;
sqe->flags = 0;
sqe->ioprio = 0;
sqe->fd = fd;
@@ -223,27 +306,33 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
sqe->addr = (unsigned long) addr;
sqe->len = len;
sqe->rw_flags = 0;
- sqe->user_data = 0;
- sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
+ sqe->buf_index = 0;
+ sqe->personality = 0;
+ sqe->file_index = 0;
+ sqe->addr3 = 0;
+ sqe->__pad2[0] = 0;
}
/**
* @pre Either fd_in or fd_out must be a pipe.
* @param off_in If fd_in refers to a pipe, off_in must be (int64_t) -1;
- * If fd_in does not refer to a pipe and off_in is (int64_t) -1, then bytes are read
- * from fd_in starting from the file offset and it is adjust appropriately;
- * If fd_in does not refer to a pipe and off_in is not (int64_t) -1, then the
- * starting offset of fd_in will be off_in.
+ * If fd_in does not refer to a pipe and off_in is (int64_t) -1,
+ * then bytes are read from fd_in starting from the file offset
+ *               and it is adjusted appropriately;
+ * If fd_in does not refer to a pipe and off_in is not
+ * (int64_t) -1, then the starting offset of fd_in will be
+ * off_in.
* @param off_out The description of off_in also applied to off_out.
* @param splice_flags see man splice(2) for description of flags.
*
- * This splice operation can be used to implement sendfile by splicing to an intermediate pipe
- * first, then splice to the final destination.
+ * This splice operation can be used to implement sendfile by splicing to an
+ * intermediate pipe first, then splice to the final destination.
* In fact, the implementation of sendfile in kernel uses splice internally.
*
- * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still failed with
- * EINVAL if one of the fd doesn't explicitly support splice operation, e.g. reading from terminal
- * is unsupported from kernel 5.7 to 5.11.
+ * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation
+ * can still fail with EINVAL if one of the fds doesn't explicitly support
+ * the splice operation, e.g. reading from a terminal is unsupported from
+ * kernel 5.7 to 5.11.
* Check issue #291 for more information.
*/
static inline void io_uring_prep_splice(struct io_uring_sqe *sqe,
@@ -252,8 +341,9 @@ static inline void io_uring_prep_splice(struct io_uring_sqe *sqe,
unsigned int nbytes,
unsigned int splice_flags)
{
- io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, off_out);
- sqe->splice_off_in = off_in;
+ io_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes,
+ (__u64) off_out);
+ sqe->splice_off_in = (__u64) off_in;
sqe->splice_fd_in = fd_in;
sqe->splice_flags = splice_flags;
}
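
To make the sendfile pattern described above concrete, a minimal sketch (illustration only; short-transfer and error handling omitted): two linked splice SQEs move nbytes from a file through a caller-created pipe to a socket.

#include <liburing.h>

/*
 * Sketch: splice file -> pipe, then pipe -> socket, linked so the second
 * runs only after the first. pipefd is a pipe created by the caller.
 */
static void queue_sendfile(struct io_uring *ring, int file_fd, int sock_fd,
			   int pipefd[2], unsigned nbytes)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_splice(sqe, file_fd, 0, pipefd[1], -1, nbytes, 0);
	sqe->flags |= IOSQE_IO_LINK;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_splice(sqe, pipefd[0], -1, sock_fd, -1, nbytes, 0);

	io_uring_submit(ring);
}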
@@ -271,32 +361,50 @@ static inline void io_uring_prep_tee(struct io_uring_sqe *sqe,
static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
const struct iovec *iovecs,
- unsigned nr_vecs, off_t offset)
+ unsigned nr_vecs, __u64 offset)
{
io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
}
+static inline void io_uring_prep_readv2(struct io_uring_sqe *sqe, int fd,
+ const struct iovec *iovecs,
+ unsigned nr_vecs, __u64 offset,
+ int flags)
+{
+ io_uring_prep_readv(sqe, fd, iovecs, nr_vecs, offset);
+ sqe->rw_flags = flags;
+}
+
static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd,
void *buf, unsigned nbytes,
- off_t offset, int buf_index)
+ __u64 offset, int buf_index)
{
io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
- sqe->buf_index = buf_index;
+ sqe->buf_index = (__u16) buf_index;
}
static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
const struct iovec *iovecs,
- unsigned nr_vecs, off_t offset)
+ unsigned nr_vecs, __u64 offset)
{
io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
}
+static inline void io_uring_prep_writev2(struct io_uring_sqe *sqe, int fd,
+ const struct iovec *iovecs,
+ unsigned nr_vecs, __u64 offset,
+ int flags)
+{
+ io_uring_prep_writev(sqe, fd, iovecs, nr_vecs, offset);
+ sqe->rw_flags = flags;
+}
+
static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
const void *buf, unsigned nbytes,
- off_t offset, int buf_index)
+ __u64 offset, int buf_index)
{
io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset);
- sqe->buf_index = buf_index;
+ sqe->buf_index = (__u16) buf_index;
}
static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd,
@@ -307,39 +415,51 @@ static inline void io_uring_prep_recvmsg(struct io_uring_sqe *sqe, int fd,
}
static inline void io_uring_prep_sendmsg(struct io_uring_sqe *sqe, int fd,
- const struct msghdr *msg, unsigned flags)
+ const struct msghdr *msg,
+ unsigned flags)
{
io_uring_prep_rw(IORING_OP_SENDMSG, sqe, fd, msg, 1, 0);
sqe->msg_flags = flags;
}
-static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
- unsigned poll_mask)
+static inline unsigned __io_uring_prep_poll_mask(unsigned poll_mask)
{
- io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0);
#if __BYTE_ORDER == __BIG_ENDIAN
poll_mask = __swahw32(poll_mask);
#endif
- sqe->poll32_events = poll_mask;
+ return poll_mask;
+}
+
+static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
+ unsigned poll_mask)
+{
+ io_uring_prep_rw(IORING_OP_POLL_ADD, sqe, fd, NULL, 0, 0);
+ sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask);
+}
+
+static inline void io_uring_prep_poll_multishot(struct io_uring_sqe *sqe,
+ int fd, unsigned poll_mask)
+{
+ io_uring_prep_poll_add(sqe, fd, poll_mask);
+ sqe->len = IORING_POLL_ADD_MULTI;
}
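
A hedged sketch of the multishot poll flow: one SQE keeps generating CQEs while fd stays readable, and IORING_CQE_F_MORE (defined further down in this patch) indicates whether the poll is still armed.

#include <poll.h>
#include <liburing.h>

/* One multishot poll SQE; each CQE is a readiness event. */
static void poll_loop(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	io_uring_prep_poll_multishot(sqe, fd, POLLIN);
	io_uring_submit(ring);

	while (!io_uring_wait_cqe(ring, &cqe)) {
		int rearmed = cqe->flags & IORING_CQE_F_MORE;

		/* ... consume the readiness event here ... */
		io_uring_cqe_seen(ring, cqe);
		if (!rearmed)
			break;	/* poll terminated; re-issue if needed */
	}
}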
static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe,
- void *user_data)
+ __u64 user_data)
{
- io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, user_data, 0, 0);
+ io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, 0, 0);
+ sqe->addr = user_data;
}
static inline void io_uring_prep_poll_update(struct io_uring_sqe *sqe,
- void *old_user_data,
- void *new_user_data,
+ __u64 old_user_data,
+ __u64 new_user_data,
unsigned poll_mask, unsigned flags)
{
- io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, old_user_data, flags,
- (__u64)new_user_data);
-#if __BYTE_ORDER == __BIG_ENDIAN
- poll_mask = __swahw32(poll_mask);
-#endif
- sqe->poll32_events = poll_mask;
+ io_uring_prep_rw(IORING_OP_POLL_REMOVE, sqe, -1, NULL, flags,
+ new_user_data);
+ sqe->addr = old_user_data;
+ sqe->poll32_events = __io_uring_prep_poll_mask(poll_mask);
}
static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd,
@@ -365,8 +485,8 @@ static inline void io_uring_prep_timeout(struct io_uring_sqe *sqe,
static inline void io_uring_prep_timeout_remove(struct io_uring_sqe *sqe,
__u64 user_data, unsigned flags)
{
- io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1,
- (void *)(unsigned long)user_data, 0, 0);
+ io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0, 0);
+ sqe->addr = user_data;
sqe->timeout_flags = flags;
}
@@ -374,9 +494,9 @@ static inline void io_uring_prep_timeout_update(struct io_uring_sqe *sqe,
struct __kernel_timespec *ts,
__u64 user_data, unsigned flags)
{
- io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1,
- (void *)(unsigned long)user_data, 0,
- (uintptr_t)ts);
+ io_uring_prep_rw(IORING_OP_TIMEOUT_REMOVE, sqe, -1, NULL, 0,
+ (uintptr_t) ts);
+ sqe->addr = user_data;
sqe->timeout_flags = flags | IORING_TIMEOUT_UPDATE;
}
@@ -386,14 +506,57 @@ static inline void io_uring_prep_accept(struct io_uring_sqe *sqe, int fd,
{
io_uring_prep_rw(IORING_OP_ACCEPT, sqe, fd, addr, 0,
(__u64) (unsigned long) addrlen);
- sqe->accept_flags = flags;
+ sqe->accept_flags = (__u32) flags;
+}
+
+/* accept directly into the fixed file table */
+static inline void io_uring_prep_accept_direct(struct io_uring_sqe *sqe, int fd,
+ struct sockaddr *addr,
+ socklen_t *addrlen, int flags,
+ unsigned int file_index)
+{
+ io_uring_prep_accept(sqe, fd, addr, addrlen, flags);
+ __io_uring_set_target_fixed_file(sqe, file_index);
+}
+
+static inline void io_uring_prep_multishot_accept(struct io_uring_sqe *sqe,
+ int fd, struct sockaddr *addr,
+ socklen_t *addrlen, int flags)
+{
+ io_uring_prep_accept(sqe, fd, addr, addrlen, flags);
+ sqe->ioprio |= IORING_ACCEPT_MULTISHOT;
+}
+
+/* multishot accept directly into the fixed file table */
+static inline void io_uring_prep_multishot_accept_direct(struct io_uring_sqe *sqe,
+ int fd,
+ struct sockaddr *addr,
+ socklen_t *addrlen,
+ int flags)
+{
+ io_uring_prep_multishot_accept(sqe, fd, addr, addrlen, flags);
+ __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1);
+}
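
Usage sketch for the multishot accept added above (handle_client is a hypothetical application callback): a single SQE yields one CQE per incoming connection, with the new descriptor in cqe->res.

#include <liburing.h>

extern void handle_client(int fd);	/* hypothetical application handler */

/* One multishot accept SQE serves many connections. */
static void accept_loop(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_submit(ring);

	while (!io_uring_wait_cqe(ring, &cqe)) {
		int rearmed = cqe->flags & IORING_CQE_F_MORE;

		if (cqe->res >= 0)
			handle_client(cqe->res);
		io_uring_cqe_seen(ring, cqe);
		if (!rearmed)
			break;	/* accept no longer armed; re-issue */
	}
}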
+
+static inline void io_uring_prep_cancel64(struct io_uring_sqe *sqe,
+ __u64 user_data, int flags)
+{
+ io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, NULL, 0, 0);
+ sqe->addr = user_data;
+ sqe->cancel_flags = (__u32) flags;
+}
+
+static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe,
+ void *user_data, int flags)
+{
+ io_uring_prep_cancel64(sqe, (__u64) (uintptr_t) user_data, flags);
}
-static inline void io_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_data,
- int flags)
+static inline void io_uring_prep_cancel_fd(struct io_uring_sqe *sqe, int fd,
+ unsigned int flags)
{
- io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, -1, user_data, 0, 0);
- sqe->cancel_flags = flags;
+ io_uring_prep_rw(IORING_OP_ASYNC_CANCEL, sqe, fd, NULL, 0, 0);
+ sqe->cancel_flags = (__u32) flags | IORING_ASYNC_CANCEL_FD;
}
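
Sketch of fd-based cancelation (illustration only): with IORING_ASYNC_CANCEL_ALL added to the flags, every pending request on fd is canceled rather than just the first match, and cqe->res reports the outcome.

#include <liburing.h>

/* Cancel all in-flight requests that target fd. */
static int cancel_all_on_fd(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_cancel_fd(sqe, fd, IORING_ASYNC_CANCEL_ALL);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;	/* count matched, or e.g. -ENOENT */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}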
static inline void io_uring_prep_link_timeout(struct io_uring_sqe *sqe,
@@ -415,7 +578,8 @@ static inline void io_uring_prep_files_update(struct io_uring_sqe *sqe,
int *fds, unsigned nr_fds,
int offset)
{
- io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, offset);
+ io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds,
+ (__u64) offset);
}
static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd,
@@ -423,14 +587,26 @@ static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd,
{
io_uring_prep_rw(IORING_OP_FALLOCATE, sqe, fd,
- (const uintptr_t *) (unsigned long) len, mode, offset);
+ (const uintptr_t *) (unsigned long) len,
+ (unsigned int) mode, (__u64) offset);
}
static inline void io_uring_prep_openat(struct io_uring_sqe *sqe, int dfd,
- const char *path, int flags, mode_t mode)
+ const char *path, int flags,
+ mode_t mode)
{
io_uring_prep_rw(IORING_OP_OPENAT, sqe, dfd, path, mode, 0);
- sqe->open_flags = flags;
+ sqe->open_flags = (__u32) flags;
+}
+
+/* open directly into the fixed file table */
+static inline void io_uring_prep_openat_direct(struct io_uring_sqe *sqe,
+ int dfd, const char *path,
+ int flags, mode_t mode,
+ unsigned file_index)
+{
+ io_uring_prep_openat(sqe, dfd, path, flags, mode);
+ __io_uring_set_target_fixed_file(sqe, file_index);
}
static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd)
@@ -438,14 +614,22 @@ static inline void io_uring_prep_close(struct io_uring_sqe *sqe, int fd)
io_uring_prep_rw(IORING_OP_CLOSE, sqe, fd, NULL, 0, 0);
}
+static inline void io_uring_prep_close_direct(struct io_uring_sqe *sqe,
+ unsigned file_index)
+{
+ io_uring_prep_close(sqe, 0);
+ __io_uring_set_target_fixed_file(sqe, file_index);
+}
+
static inline void io_uring_prep_read(struct io_uring_sqe *sqe, int fd,
- void *buf, unsigned nbytes, off_t offset)
+ void *buf, unsigned nbytes, __u64 offset)
{
io_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset);
}
static inline void io_uring_prep_write(struct io_uring_sqe *sqe, int fd,
- const void *buf, unsigned nbytes, off_t offset)
+ const void *buf, unsigned nbytes,
+ __u64 offset)
{
io_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset);
}
@@ -457,35 +641,35 @@ static inline void io_uring_prep_statx(struct io_uring_sqe *sqe, int dfd,
{
io_uring_prep_rw(IORING_OP_STATX, sqe, dfd, path, mask,
(__u64) (unsigned long) statxbuf);
- sqe->statx_flags = flags;
+ sqe->statx_flags = (__u32) flags;
}
static inline void io_uring_prep_fadvise(struct io_uring_sqe *sqe, int fd,
- off_t offset, off_t len, int advice)
+ __u64 offset, off_t len, int advice)
{
- io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, len, offset);
- sqe->fadvise_advice = advice;
+ io_uring_prep_rw(IORING_OP_FADVISE, sqe, fd, NULL, (__u32) len, offset);
+ sqe->fadvise_advice = (__u32) advice;
}
static inline void io_uring_prep_madvise(struct io_uring_sqe *sqe, void *addr,
off_t length, int advice)
{
- io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, length, 0);
- sqe->fadvise_advice = advice;
+ io_uring_prep_rw(IORING_OP_MADVISE, sqe, -1, addr, (__u32) length, 0);
+ sqe->fadvise_advice = (__u32) advice;
}
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
const void *buf, size_t len, int flags)
{
- io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, len, 0);
- sqe->msg_flags = flags;
+ io_uring_prep_rw(IORING_OP_SEND, sqe, sockfd, buf, (__u32) len, 0);
+ sqe->msg_flags = (__u32) flags;
}
static inline void io_uring_prep_recv(struct io_uring_sqe *sqe, int sockfd,
void *buf, size_t len, int flags)
{
- io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, len, 0);
- sqe->msg_flags = flags;
+ io_uring_prep_rw(IORING_OP_RECV, sqe, sockfd, buf, (__u32) len, 0);
+ sqe->msg_flags = (__u32) flags;
}
static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd,
@@ -495,57 +679,82 @@ static inline void io_uring_prep_openat2(struct io_uring_sqe *sqe, int dfd,
(uint64_t) (uintptr_t) how);
}
+/* open directly into the fixed file table */
+static inline void io_uring_prep_openat2_direct(struct io_uring_sqe *sqe,
+ int dfd, const char *path,
+ struct open_how *how,
+ unsigned file_index)
+{
+ io_uring_prep_openat2(sqe, dfd, path, how);
+ __io_uring_set_target_fixed_file(sqe, file_index);
+}
+
struct epoll_event;
static inline void io_uring_prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd,
int fd, int op,
struct epoll_event *ev)
{
- io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev, op, fd);
+ io_uring_prep_rw(IORING_OP_EPOLL_CTL, sqe, epfd, ev,
+ (__u32) op, (__u32) fd);
}
static inline void io_uring_prep_provide_buffers(struct io_uring_sqe *sqe,
void *addr, int len, int nr,
int bgid, int bid)
{
- io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, len, bid);
- sqe->buf_group = bgid;
+ io_uring_prep_rw(IORING_OP_PROVIDE_BUFFERS, sqe, nr, addr, (__u32) len,
+ (__u64) bid);
+ sqe->buf_group = (__u16) bgid;
}
static inline void io_uring_prep_remove_buffers(struct io_uring_sqe *sqe,
int nr, int bgid)
{
io_uring_prep_rw(IORING_OP_REMOVE_BUFFERS, sqe, nr, NULL, 0, 0);
- sqe->buf_group = bgid;
+ sqe->buf_group = (__u16) bgid;
}
static inline void io_uring_prep_shutdown(struct io_uring_sqe *sqe, int fd,
int how)
{
- io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, how, 0);
+ io_uring_prep_rw(IORING_OP_SHUTDOWN, sqe, fd, NULL, (__u32) how, 0);
}
static inline void io_uring_prep_unlinkat(struct io_uring_sqe *sqe, int dfd,
const char *path, int flags)
{
io_uring_prep_rw(IORING_OP_UNLINKAT, sqe, dfd, path, 0, 0);
- sqe->unlink_flags = flags;
+ sqe->unlink_flags = (__u32) flags;
+}
+
+static inline void io_uring_prep_unlink(struct io_uring_sqe *sqe,
+ const char *path, int flags)
+{
+ io_uring_prep_unlinkat(sqe, AT_FDCWD, path, flags);
}
static inline void io_uring_prep_renameat(struct io_uring_sqe *sqe, int olddfd,
const char *oldpath, int newdfd,
const char *newpath, int flags)
{
- io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath, newdfd,
+ io_uring_prep_rw(IORING_OP_RENAMEAT, sqe, olddfd, oldpath,
+ (__u32) newdfd,
(uint64_t) (uintptr_t) newpath);
- sqe->rename_flags = flags;
+ sqe->rename_flags = (__u32) flags;
+}
+
+static inline void io_uring_prep_rename(struct io_uring_sqe *sqe,
+ const char *oldpath, const char *newpath)
+{
+ io_uring_prep_renameat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, 0);
}
static inline void io_uring_prep_sync_file_range(struct io_uring_sqe *sqe,
int fd, unsigned len,
- off_t offset, int flags)
+ __u64 offset, int flags)
{
io_uring_prep_rw(IORING_OP_SYNC_FILE_RANGE, sqe, fd, NULL, len, offset);
- sqe->sync_range_flags = flags;
+ sqe->sync_range_flags = (__u32) flags;
}
static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd,
@@ -554,20 +763,123 @@ static inline void io_uring_prep_mkdirat(struct io_uring_sqe *sqe, int dfd,
io_uring_prep_rw(IORING_OP_MKDIRAT, sqe, dfd, path, mode, 0);
}
+static inline void io_uring_prep_mkdir(struct io_uring_sqe *sqe,
+ const char *path, mode_t mode)
+{
+ io_uring_prep_mkdirat(sqe, AT_FDCWD, path, mode);
+}
+
static inline void io_uring_prep_symlinkat(struct io_uring_sqe *sqe,
- const char *target, int newdirfd, const char *linkpath)
+ const char *target, int newdirfd,
+ const char *linkpath)
{
io_uring_prep_rw(IORING_OP_SYMLINKAT, sqe, newdirfd, target, 0,
(uint64_t) (uintptr_t) linkpath);
}
+static inline void io_uring_prep_symlink(struct io_uring_sqe *sqe,
+ const char *target, const char *linkpath)
+{
+ io_uring_prep_symlinkat(sqe, target, AT_FDCWD, linkpath);
+}
+
static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd,
const char *oldpath, int newdfd,
const char *newpath, int flags)
{
- io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, newdfd,
+ io_uring_prep_rw(IORING_OP_LINKAT, sqe, olddfd, oldpath, (__u32) newdfd,
(uint64_t) (uintptr_t) newpath);
- sqe->hardlink_flags = flags;
+ sqe->hardlink_flags = (__u32) flags;
+}
+
+static inline void io_uring_prep_link(struct io_uring_sqe *sqe,
+ const char *oldpath, const char *newpath, int flags)
+{
+ io_uring_prep_linkat(sqe, AT_FDCWD, oldpath, AT_FDCWD, newpath, flags);
+}
+
+static inline void io_uring_prep_msg_ring(struct io_uring_sqe *sqe, int fd,
+ unsigned int len, __u64 data,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_MSG_RING, sqe, fd, NULL, len, data);
+ sqe->rw_flags = flags;
+}
+
+static inline void io_uring_prep_getxattr(struct io_uring_sqe *sqe,
+ const char *name,
+ const char *value,
+ const char *path,
+ size_t len)
+{
+ io_uring_prep_rw(IORING_OP_GETXATTR, sqe, 0, name, len,
+ (__u64) (uintptr_t) value);
+ sqe->addr3 = (__u64) (uintptr_t) path;
+ sqe->xattr_flags = 0;
+}
+
+static inline void io_uring_prep_setxattr(struct io_uring_sqe *sqe,
+ const char *name,
+ const char *value,
+ const char *path,
+ int flags,
+ size_t len)
+{
+ io_uring_prep_rw(IORING_OP_SETXATTR, sqe, 0, name, len,
+ (__u64) (uintptr_t) value);
+ sqe->addr3 = (__u64) (uintptr_t) path;
+ sqe->xattr_flags = flags;
+}
+
+static inline void io_uring_prep_fgetxattr(struct io_uring_sqe *sqe,
+ int fd,
+ const char *name,
+ const char *value,
+ size_t len)
+{
+ io_uring_prep_rw(IORING_OP_FGETXATTR, sqe, fd, name, len,
+ (__u64) (uintptr_t) value);
+ sqe->xattr_flags = 0;
+}
+
+static inline void io_uring_prep_fsetxattr(struct io_uring_sqe *sqe,
+ int fd,
+ const char *name,
+ const char *value,
+ int flags,
+ size_t len)
+{
+ io_uring_prep_rw(IORING_OP_FSETXATTR, sqe, fd, name, len,
+ (__u64) (uintptr_t) value);
+ sqe->xattr_flags = flags;
+}
+
+static inline void io_uring_prep_socket(struct io_uring_sqe *sqe, int domain,
+ int type, int protocol,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type);
+ sqe->rw_flags = flags;
+}
+
+static inline void io_uring_prep_socket_direct(struct io_uring_sqe *sqe,
+ int domain, int type,
+ int protocol,
+ unsigned file_index,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type);
+ sqe->rw_flags = flags;
+ __io_uring_set_target_fixed_file(sqe, file_index);
+}
+
+static inline void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe,
+ int domain, int type, int protocol,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_SOCKET, sqe, domain, NULL, protocol, type);
+ sqe->rw_flags = flags;
+ __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1);
}
/*
@@ -576,15 +888,18 @@ static inline void io_uring_prep_linkat(struct io_uring_sqe *sqe, int olddfd,
*/
static inline unsigned io_uring_sq_ready(const struct io_uring *ring)
{
+ unsigned khead = *ring->sq.khead;
+
/*
- * Without a barrier, we could miss an update and think the SQ wasn't ready.
- * We don't need the load acquire for non-SQPOLL since then we drive updates.
+ * Without a barrier, we could miss an update and think the SQ wasn't
+ * ready. We don't need the load acquire for non-SQPOLL since then we
+ * drive updates.
*/
if (ring->flags & IORING_SETUP_SQPOLL)
- return ring->sq.sqe_tail - io_uring_smp_load_acquire(ring->sq.khead);
+ khead = io_uring_smp_load_acquire(ring->sq.khead);
/* always use real head, to avoid losing sync for short submit */
- return ring->sq.sqe_tail - *ring->sq.khead;
+ return ring->sq.sqe_tail - khead;
}
/*
@@ -671,12 +986,62 @@ static inline int io_uring_wait_cqe_nr(struct io_uring *ring,
}
/*
+ * Internal helper, don't use directly in applications. Use one of the
+ * "official" versions of this, io_uring_peek_cqe(), io_uring_wait_cqe(),
+ * or io_uring_wait_cqes*().
+ */
+static inline int __io_uring_peek_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ unsigned *nr_available)
+{
+ struct io_uring_cqe *cqe;
+ int err = 0;
+ unsigned available;
+ unsigned mask = *ring->cq.kring_mask;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_CQE32)
+ shift = 1;
+
+ do {
+ unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
+ unsigned head = *ring->cq.khead;
+
+ cqe = NULL;
+ available = tail - head;
+ if (!available)
+ break;
+
+ cqe = &ring->cq.cqes[(head & mask) << shift];
+ if (!(ring->features & IORING_FEAT_EXT_ARG) &&
+ cqe->user_data == LIBURING_UDATA_TIMEOUT) {
+ if (cqe->res < 0)
+ err = cqe->res;
+ io_uring_cq_advance(ring, 1);
+ if (!err)
+ continue;
+ cqe = NULL;
+ }
+
+ break;
+ } while (1);
+
+ *cqe_ptr = cqe;
+ if (nr_available)
+ *nr_available = available;
+ return err;
+}
+
+/*
* Return an IO completion, if one is readily available. Returns 0 with
* cqe_ptr filled in on success, -errno on failure.
*/
static inline int io_uring_peek_cqe(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr)
{
+ if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr)
+ return 0;
+
return io_uring_wait_cqe_nr(ring, cqe_ptr, 0);
}
@@ -687,9 +1052,105 @@ static inline int io_uring_peek_cqe(struct io_uring *ring,
static inline int io_uring_wait_cqe(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr)
{
+ if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr)
+ return 0;
+
return io_uring_wait_cqe_nr(ring, cqe_ptr, 1);
}
+/*
+ * Return an sqe to fill. Application must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant sqe, or NULL if we're full.
+ */
+static inline struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
+{
+ struct io_uring_sq *sq = &ring->sq;
+ unsigned int head = io_uring_smp_load_acquire(sq->khead);
+ unsigned int next = sq->sqe_tail + 1;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_SQE128)
+ shift = 1;
+
+ if (next - head <= *sq->kring_entries) {
+ struct io_uring_sqe *sqe;
+
+ sqe = &sq->sqes[(sq->sqe_tail & *sq->kring_mask) << shift];
+ sq->sqe_tail = next;
+ return sqe;
+ }
+
+ return NULL;
+}
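
The contract above suggests the usual pattern when NULL is returned, sketched here: submit what is already queued to free SQ slots, then retry once. This mirrors what __io_uring_submit_timeout() in queue.c below does when it needs an SQE for the internal timeout.

#include <liburing.h>

/* Get an SQE, submitting pending entries once if the ring is full. */
static struct io_uring_sqe *get_sqe_retry(struct io_uring *ring)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe) {
		io_uring_submit(ring);		/* frees SQ slots */
		sqe = io_uring_get_sqe(ring);	/* may still be NULL */
	}
	return sqe;
}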
+
+/*
+ * Return the appropriate mask for a buffer ring of size 'ring_entries'
+ */
+static inline int io_uring_buf_ring_mask(__u32 ring_entries)
+{
+ return ring_entries - 1;
+}
+
+static inline void io_uring_buf_ring_init(struct io_uring_buf_ring *br)
+{
+ br->tail = 0;
+}
+
+/*
+ * Assign 'buf' with the addr/len/buffer ID supplied
+ */
+static inline void io_uring_buf_ring_add(struct io_uring_buf_ring *br,
+ void *addr, unsigned int len,
+ unsigned short bid, int mask,
+ int buf_offset)
+{
+ struct io_uring_buf *buf = &br->bufs[(br->tail + buf_offset) & mask];
+
+ buf->addr = (unsigned long) (uintptr_t) addr;
+ buf->len = len;
+ buf->bid = bid;
+}
+
+/*
+ * Make 'count' new buffers visible to the kernel. Called after
+ * io_uring_buf_ring_add() has been called 'count' times to fill in new
+ * buffers.
+ */
+static inline void io_uring_buf_ring_advance(struct io_uring_buf_ring *br,
+ int count)
+{
+ unsigned short new_tail = br->tail + count;
+
+ io_uring_smp_store_release(&br->tail, new_tail);
+}
+
+/*
+ * Make 'count' new buffers visible to the kernel while at the same time
+ * advancing the CQ ring's seen entries. This can be used when the
+ * application uses ring provided buffers and returns buffers while
+ * processing CQEs, avoiding an extra atomic when both the CQ ring and the
+ * buffer ring index need to be incremented at the same time.
+ */
+static inline void io_uring_buf_ring_cq_advance(struct io_uring *ring,
+ struct io_uring_buf_ring *br,
+ int count)
+{
+ br->tail += count;
+ io_uring_cq_advance(ring, count);
+}
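
Putting the helpers together, a minimal setup sketch (illustration only; power-of-two entries and page alignment assumed, error handling trimmed) that registers a buffer ring via IORING_REGISTER_PBUF_RING and publishes every buffer:

#include <stdlib.h>
#include <string.h>
#include <liburing.h>

/* Register a ring of 'entries' provided buffers in group 'bgid'. */
static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring,
						unsigned entries, int bgid,
						void **bufs, unsigned buf_len)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	unsigned i;

	if (posix_memalign((void **) &br, 4096,
			   entries * sizeof(struct io_uring_buf)))
		return NULL;

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long) br;
	reg.ring_entries = entries;
	reg.bgid = bgid;
	if (io_uring_register_buf_ring(ring, &reg, 0) < 0)
		return NULL;

	io_uring_buf_ring_init(br);
	for (i = 0; i < entries; i++)
		io_uring_buf_ring_add(br, bufs[i], buf_len,
				      (unsigned short) i,
				      io_uring_buf_ring_mask(entries),
				      (int) i);
	io_uring_buf_ring_advance(br, entries);
	return br;
}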
+
+#ifndef LIBURING_INTERNAL
+static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
+{
+ return _io_uring_get_sqe(ring);
+}
+#else
+struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
+#endif
+
ssize_t io_uring_mlock_size(unsigned entries, unsigned flags);
ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p);
diff --git a/src/include/liburing/barrier.h b/src/include/liburing/barrier.h
index 89ac682..aedeb47 100644
--- a/src/include/liburing/barrier.h
+++ b/src/include/liburing/barrier.h
@@ -52,6 +52,11 @@ static inline T io_uring_smp_load_acquire(const T *p)
reinterpret_cast<const std::atomic<T> *>(p),
std::memory_order_acquire);
}
+
+static inline void io_uring_smp_mb()
+{
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+}
#else
#include <stdatomic.h>
@@ -68,6 +73,9 @@ static inline T io_uring_smp_load_acquire(const T *p)
#define io_uring_smp_load_acquire(p) \
atomic_load_explicit((_Atomic __typeof__(*(p)) *)(p), \
memory_order_acquire)
+
+#define io_uring_smp_mb() \
+ atomic_thread_fence(memory_order_seq_cst)
#endif
#endif /* defined(LIBURING_BARRIER_H) */
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index e4a4fc4..2f391c9 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -49,25 +49,35 @@ struct io_uring_sqe {
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
+ __u32 xattr_flags;
};
__u64 user_data; /* data to be passed back at completion time */
+ /* pack this to avoid bogus arm OABI complaints */
union {
- struct {
- /* pack this to avoid bogus arm OABI complaints */
- union {
- /* index into fixed buffers, if used */
- __u16 buf_index;
- /* for grouped buffer selection */
- __u16 buf_group;
- } __attribute__((packed));
- /* personality to use, if used */
- __u16 personality;
- __s32 splice_fd_in;
- };
- __u64 __pad2[3];
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* for grouped buffer selection */
+ __u16 buf_group;
+ } __attribute__((packed));
+ /* personality to use, if used */
+ __u16 personality;
+ union {
+ __s32 splice_fd_in;
+ __u32 file_index;
};
+ __u64 addr3;
+ __u64 __pad2[1];
};
+/*
+ * If sqe->file_index is set to this for opcodes that instantiate a new
+ * direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ * an available direct descriptor instead of having the application pass one
+ * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
+ * if the space is full.
+ */
+#define IORING_FILE_INDEX_ALLOC (~0U)
+
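
A sketch of the allocation mode (illustration only): the liburing prep helpers encode the target slot as index + 1, so IORING_FILE_INDEX_ALLOC - 1 is passed, exactly as the *_direct_alloc helpers in this patch do. Assumes a sparse file table was registered beforehand.

#include <fcntl.h>
#include <liburing.h>

/* Open a file into a kernel-chosen fixed file slot; returns the slot. */
static int open_direct_alloc(struct io_uring *ring, const char *path)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int slot;

	io_uring_prep_openat_direct(sqe, AT_FDCWD, path, O_RDONLY, 0,
				    IORING_FILE_INDEX_ALLOC - 1);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	slot = cqe->res;	/* allocated index, or -ENFILE if full */
	io_uring_cqe_seen(ring, cqe);
	return slot;
}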
enum {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
@@ -75,6 +85,7 @@ enum {
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
+ IOSQE_CQE_SKIP_SUCCESS_BIT,
};
/*
@@ -92,6 +103,8 @@ enum {
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS (1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
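
A common use, sketched under the assumption that the kernel advertises IORING_FEAT_CQE_SKIP: in a write+fsync link, suppress the write's CQE on success so only the fsync (or a failure) posts a completion.

#include <liburing.h>

/* Linked write+fsync where a successful write posts no CQE. */
static void write_then_fsync(struct io_uring *ring, int fd,
			     const void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);

	io_uring_submit(ring);
}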
/*
* io_uring_setup() flags
@@ -103,8 +116,26 @@ enum {
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN (1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Only valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
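
Setup sketch (illustration only): request cooperative task running and fall back to a plain ring on kernels that reject the flag.

#include <errno.h>
#include <string.h>
#include <liburing.h>

/* Prefer IORING_SETUP_COOP_TASKRUN, fall back if unsupported. */
static int init_coop_ring(struct io_uring *ring, unsigned entries)
{
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_COOP_TASKRUN;
	ret = io_uring_queue_init_params(entries, ring, &p);
	if (ret == -EINVAL)	/* kernel too old for COOP_TASKRUN */
		ret = io_uring_queue_init(entries, ring, 0);
	return ret;
}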
-enum {
+#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
+
+enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
IORING_OP_WRITEV,
@@ -145,6 +176,13 @@ enum {
IORING_OP_MKDIRAT,
IORING_OP_SYMLINKAT,
IORING_OP_LINKAT,
+ IORING_OP_MSG_RING,
+ IORING_OP_FSETXATTR,
+ IORING_OP_SETXATTR,
+ IORING_OP_FGETXATTR,
+ IORING_OP_GETXATTR,
+ IORING_OP_SOCKET,
+ IORING_OP_URING_CMD,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -158,9 +196,14 @@ enum {
/*
* sqe->timeout_flags
*/
-#define IORING_TIMEOUT_ABS (1U << 0)
-#define IORING_TIMEOUT_UPDATE (1U << 1)
-
+#define IORING_TIMEOUT_ABS (1U << 0)
+#define IORING_TIMEOUT_UPDATE (1U << 1)
+#define IORING_TIMEOUT_BOOTTIME (1U << 2)
+#define IORING_TIMEOUT_REALTIME (1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
+#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
* sqe->splice_flags
* extends splice(2) flags
@@ -183,12 +226,45 @@ enum {
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
/*
+ * ASYNC_CANCEL flags.
+ *
+ * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key
+ * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
+ * request 'user_data'
+ * IORING_ASYNC_CANCEL_ANY Match any request
+ */
+#define IORING_ASYNC_CANCEL_ALL (1U << 0)
+#define IORING_ASYNC_CANCEL_FD (1U << 1)
+#define IORING_ASYNC_CANCEL_ANY (1U << 2)
+
+/*
+ * send/sendmsg and recv/recvmsg flags (sqe->addr2)
+ *
+ * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send
+ * or receive and arming poll if that yields an
+ * -EAGAIN result, arm poll upfront and skip
+ * the initial transfer attempt.
+ */
+#define IORING_RECVSEND_POLL_FIRST (1U << 0)
+
+/*
+ * accept flags stored in sqe->ioprio
+ */
+#define IORING_ACCEPT_MULTISHOT (1U << 0)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
__u64 user_data; /* sqe->data submission passed back */
__s32 res; /* result code for this event */
__u32 flags;
+
+ /*
+ * If the ring is initialized with IORING_SETUP_CQE32, then this field
+ * contains 16 bytes of padding, doubling the size of the CQE.
+ */
+ __u64 big_cqe[];
};
/*
@@ -196,9 +272,11 @@ struct io_uring_cqe {
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
* IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
+ * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
+#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
@@ -231,6 +309,7 @@ struct io_sqring_offsets {
*/
#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */
+#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */
struct io_cqring_offsets {
__u32 head;
@@ -254,10 +333,11 @@ struct io_cqring_offsets {
/*
* io_uring_enter(2) flags
*/
-#define IORING_ENTER_GETEVENTS (1U << 0)
-#define IORING_ENTER_SQ_WAKEUP (1U << 1)
-#define IORING_ENTER_SQ_WAIT (1U << 2)
-#define IORING_ENTER_EXT_ARG (1U << 3)
+#define IORING_ENTER_GETEVENTS (1U << 0)
+#define IORING_ENTER_SQ_WAKEUP (1U << 1)
+#define IORING_ENTER_SQ_WAIT (1U << 2)
+#define IORING_ENTER_EXT_ARG (1U << 3)
+#define IORING_ENTER_REGISTERED_RING (1U << 4)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -289,6 +369,8 @@ struct io_uring_params {
#define IORING_FEAT_EXT_ARG (1U << 8)
#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
#define IORING_FEAT_RSRC_TAGS (1U << 10)
+#define IORING_FEAT_CQE_SKIP (1U << 11)
+#define IORING_FEAT_LINKED_FILE (1U << 12)
/*
* io_uring_register(2) opcodes and arguments
@@ -314,10 +396,31 @@ enum {
IORING_REGISTER_BUFFERS2 = 15,
IORING_REGISTER_BUFFERS_UPDATE = 16,
+ /* set/clear io-wq thread affinities */
+ IORING_REGISTER_IOWQ_AFF = 17,
+ IORING_UNREGISTER_IOWQ_AFF = 18,
+
+ /* set/get max number of io-wq workers */
+ IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
+
+ /* register/unregister io_uring fd with the ring */
+ IORING_REGISTER_RING_FDS = 20,
+ IORING_UNREGISTER_RING_FDS = 21,
+
+ /* register ring based provide buffer group */
+ IORING_REGISTER_PBUF_RING = 22,
+ IORING_UNREGISTER_PBUF_RING = 23,
+
/* this goes last */
IORING_REGISTER_LAST
};
+/* io-wq worker categories */
+enum {
+ IO_WQ_BOUND,
+ IO_WQ_UNBOUND,
+};
+
/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
__u32 offset;
@@ -325,9 +428,15 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+/*
+ * Register a fully sparse file space, rather than pass in an array of all
+ * -1 file descriptors.
+ */
+#define IORING_RSRC_REGISTER_SPARSE (1U << 0)
+
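
Usage sketch combining sparse registration with the tagged update opcode (illustration only): reserve empty fixed-file slots up front, then install a descriptor into slot 0 later.

#include <liburing.h>

/* Reserve 'nr' empty slots, then fill slot 0 with fd under tag 1. */
static int reserve_and_fill(struct io_uring *ring, unsigned nr, int fd)
{
	__u64 tag = 1;
	int ret;

	ret = io_uring_register_files_sparse(ring, nr);
	if (ret)
		return ret;
	return io_uring_register_files_update_tag(ring, 0, &fd, &tag, 1);
}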
struct io_uring_rsrc_register {
__u32 nr;
- __u32 resv;
+ __u32 flags;
__u64 resv2;
__aligned_u64 data;
__aligned_u64 tags;
@@ -365,7 +474,7 @@ struct io_uring_probe {
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
- struct io_uring_probe_op ops[];
+ struct io_uring_probe_op ops[0];
};
struct io_uring_restriction {
@@ -379,6 +488,38 @@ struct io_uring_restriction {
__u32 resv2[3];
};
+struct io_uring_buf {
+ __u64 addr;
+ __u32 len;
+ __u16 bid;
+ __u16 resv;
+};
+
+struct io_uring_buf_ring {
+ union {
+ /*
+ * To avoid spilling into more pages than we need to, the
+ * ring tail is overlaid with the io_uring_buf->resv field.
+ */
+ struct {
+ __u64 resv1;
+ __u32 resv2;
+ __u16 resv3;
+ __u16 tail;
+ };
+ struct io_uring_buf bufs[0];
+ };
+};
+
+/* argument for IORING_(UN)REGISTER_PBUF_RING */
+struct io_uring_buf_reg {
+ __u64 ring_addr;
+ __u32 ring_entries;
+ __u16 bgid;
+ __u16 pad;
+ __u64 resv[3];
+};
+
/*
* io_uring_restriction->opcode values
*/
@@ -405,6 +546,11 @@ struct io_uring_getevents_arg {
__u64 ts;
};
#ifdef __cplusplus
}
#endif
diff --git a/src/int_flags.h b/src/int_flags.h
new file mode 100644
index 0000000..90505ec
--- /dev/null
+++ b/src/int_flags.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef LIBURING_INT_FLAGS
+#define LIBURING_INT_FLAGS
+
+enum {
+ INT_FLAG_REG_RING = 1,
+};
+
+#endif
diff --git a/src/lib.h b/src/lib.h
new file mode 100644
index 0000000..6672cc5
--- /dev/null
+++ b/src/lib.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef LIBURING_LIB_H
+#define LIBURING_LIB_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define __INTERNAL__LIBURING_LIB_H
+#if defined(__x86_64__) || defined(__i386__)
+ #include "arch/x86/lib.h"
+#else
+ /*
+ * We don't have nolibc support for this arch. Must use libc!
+ */
+ #ifdef CONFIG_NOLIBC
+ #error "This arch doesn't support building liburing without libc"
+ #endif
+ /* libc wrappers. */
+ #include "arch/generic/lib.h"
+#endif
+#undef __INTERNAL__LIBURING_LIB_H
+
+
+#ifndef offsetof
+ #define offsetof(TYPE, FIELD) ((size_t) &((TYPE *)0)->FIELD)
+#endif
+
+#ifndef container_of
+ #define container_of(PTR, TYPE, FIELD) ({ \
+ __typeof__(((TYPE *)0)->FIELD) *__FIELD_PTR = (PTR); \
+ (TYPE *)((char *) __FIELD_PTR - offsetof(TYPE, FIELD)); \
+ })
+#endif
+
+void *__uring_malloc(size_t len);
+void __uring_free(void *p);
+
+static inline void *uring_malloc(size_t len)
+{
+#ifdef CONFIG_NOLIBC
+ return __uring_malloc(len);
+#else
+ return malloc(len);
+#endif
+}
+
+static inline void uring_free(void *ptr)
+{
+#ifdef CONFIG_NOLIBC
+ __uring_free(ptr);
+#else
+ free(ptr);
+#endif
+}
+
+#endif /* #ifndef LIBURING_LIB_H */
diff --git a/src/liburing.map b/src/liburing.map
index 012ac4e..879f791 100644
--- a/src/liburing.map
+++ b/src/liburing.map
@@ -36,4 +36,22 @@ LIBURING_2.1 {
global:
io_uring_mlock_size_params;
io_uring_mlock_size;
+ io_uring_register_buffers_tags;
+ io_uring_register_buffers_update_tag;
+ io_uring_register_files_tags;
+ io_uring_register_files_update_tag;
+ io_uring_register_iowq_aff;
+ io_uring_unregister_iowq_aff;
+ io_uring_register_iowq_max_workers;
} LIBURING_2.0;
+
+LIBURING_2.2 {
+ global:
+ io_uring_submit_and_wait_timeout;
+ io_uring_register_ring_fd;
+ io_uring_unregister_ring_fd;
+ io_uring_register_files_sparse;
+ io_uring_register_buffers_sparse;
+ io_uring_register_buf_ring;
+ io_uring_unregister_buf_ring;
+} LIBURING_2.1;
diff --git a/src/nolibc.c b/src/nolibc.c
new file mode 100644
index 0000000..9a04ead
--- /dev/null
+++ b/src/nolibc.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: MIT */
+
+#ifndef CONFIG_NOLIBC
+#error "This file should only be compiled for no libc build"
+#endif
+
+#include "lib.h"
+#include "syscall.h"
+
+void *memset(void *s, int c, size_t n)
+{
+ size_t i;
+ unsigned char *p = s;
+
+ for (i = 0; i < n; i++)
+ p[i] = (unsigned char) c;
+
+ return s;
+}
+
+struct uring_heap {
+ size_t len;
+ char user_p[] __attribute__((__aligned__));
+};
+
+void *__uring_malloc(size_t len)
+{
+ struct uring_heap *heap;
+
+ heap = __sys_mmap(NULL, sizeof(*heap) + len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (IS_ERR(heap))
+ return NULL;
+
+ heap->len = sizeof(*heap) + len;
+ return heap->user_p;
+}
+
+void __uring_free(void *p)
+{
+ struct uring_heap *heap;
+
+ if (uring_unlikely(!p))
+ return;
+
+ heap = container_of(p, struct uring_heap, user_p);
+ __sys_munmap(heap, heap->len);
+}
diff --git a/src/queue.c b/src/queue.c
index 2f0f19b..ce0ecf6 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -1,20 +1,12 @@
/* SPDX-License-Identifier: MIT */
#define _POSIX_C_SOURCE 200112L
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdbool.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"
-#include "liburing.h"
-#include "liburing/barrier.h"
-
-#include "syscall.h"
/*
* Returns true if we're not using SQ thread (thus nobody submits but us)
@@ -26,6 +18,12 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
if (!(ring->flags & IORING_SETUP_SQPOLL))
return true;
+ /*
+ * Ensure the kernel can see the store to the SQ tail before we read
+ * the flags.
+ */
+ io_uring_smp_mb();
+
if (uring_unlikely(IO_URING_READ_ONCE(*ring->sq.kflags) &
IORING_SQ_NEED_WAKEUP)) {
*flags |= IORING_ENTER_SQ_WAKEUP;
@@ -37,44 +35,13 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
static inline bool cq_ring_needs_flush(struct io_uring *ring)
{
- return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
+ return IO_URING_READ_ONCE(*ring->sq.kflags) &
+ (IORING_SQ_CQ_OVERFLOW | IORING_SQ_TASKRUN);
}
-static int __io_uring_peek_cqe(struct io_uring *ring,
- struct io_uring_cqe **cqe_ptr,
- unsigned *nr_available)
+static inline bool cq_ring_needs_enter(struct io_uring *ring)
{
- struct io_uring_cqe *cqe;
- int err = 0;
- unsigned available;
- unsigned mask = *ring->cq.kring_mask;
-
- do {
- unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
- unsigned head = *ring->cq.khead;
-
- cqe = NULL;
- available = tail - head;
- if (!available)
- break;
-
- cqe = &ring->cq.cqes[head & mask];
- if (!(ring->features & IORING_FEAT_EXT_ARG) &&
- cqe->user_data == LIBURING_UDATA_TIMEOUT) {
- if (cqe->res < 0)
- err = cqe->res;
- io_uring_cq_advance(ring, 1);
- if (!err)
- continue;
- cqe = NULL;
- }
-
- break;
- } while (1);
-
- *cqe_ptr = cqe;
- *nr_available = available;
- return err;
+ return (ring->flags & IORING_SETUP_IOPOLL) || cq_ring_needs_flush(ring);
}
struct get_data {
@@ -85,15 +52,16 @@ struct get_data {
void *arg;
};
-static int _io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+static int _io_uring_get_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
struct get_data *data)
{
struct io_uring_cqe *cqe = NULL;
+ bool looped = false;
int err;
do {
bool need_enter = false;
- bool cq_overflow_flush = false;
unsigned flags = 0;
unsigned nr_available;
int ret;
@@ -102,34 +70,40 @@ static int _io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_pt
if (err)
break;
if (!cqe && !data->wait_nr && !data->submit) {
- if (!cq_ring_needs_flush(ring)) {
+ /*
+ * If we already looped once, we already entered
+ * the kernel. Since there's nothing to submit or
+ * wait for, don't keep retrying.
+ */
+ if (looped || !cq_ring_needs_enter(ring)) {
err = -EAGAIN;
break;
}
- cq_overflow_flush = true;
+ need_enter = true;
}
- if (data->wait_nr > nr_available || cq_overflow_flush) {
+ if (data->wait_nr > nr_available || need_enter) {
flags = IORING_ENTER_GETEVENTS | data->get_flags;
need_enter = true;
}
- if (data->submit) {
- sq_ring_needs_enter(ring, &flags);
+ if (data->submit && sq_ring_needs_enter(ring, &flags))
need_enter = true;
- }
if (!need_enter)
break;
- ret = __sys_io_uring_enter2(ring->ring_fd, data->submit,
- data->wait_nr, flags, data->arg,
- data->sz);
+ if (ring->int_flags & INT_FLAG_REG_RING)
+ flags |= IORING_ENTER_REGISTERED_RING;
+ ret = ____sys_io_uring_enter2(ring->enter_ring_fd, data->submit,
+ data->wait_nr, flags, data->arg,
+ data->sz);
if (ret < 0) {
- err = -errno;
+ err = ret;
break;
}
data->submit -= ret;
if (cqe)
break;
+ looped = true;
} while (1);
*cqe_ptr = cqe;
@@ -159,6 +133,10 @@ unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
{
unsigned ready;
bool overflow_checked = false;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_CQE32)
+ shift = 1;
again:
ready = io_uring_cq_ready(ring);
@@ -171,7 +149,7 @@ again:
count = count > ready ? ready : count;
last = head + count;
for (;head != last; head++, i++)
- cqes[i] = &ring->cq.cqes[head & mask];
+ cqes[i] = &ring->cq.cqes[(head & mask) << shift];
return count;
}
@@ -180,8 +158,11 @@ again:
goto done;
if (cq_ring_needs_flush(ring)) {
- __sys_io_uring_enter(ring->ring_fd, 0, 0,
- IORING_ENTER_GETEVENTS, NULL);
+ int flags = IORING_ENTER_GETEVENTS;
+
+ if (ring->int_flags & INT_FLAG_REG_RING)
+ flags |= IORING_ENTER_REGISTERED_RING;
+ ____sys_io_uring_enter(ring->enter_ring_fd, 0, 0, flags, NULL);
overflow_checked = true;
goto again;
}
@@ -239,7 +220,8 @@ out:
*/
static int io_uring_wait_cqes_new(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr,
- unsigned wait_nr, struct __kernel_timespec *ts,
+ unsigned wait_nr,
+ struct __kernel_timespec *ts,
sigset_t *sigmask)
{
struct io_uring_getevents_arg arg = {
@@ -248,7 +230,6 @@ static int io_uring_wait_cqes_new(struct io_uring *ring,
.ts = (unsigned long) ts
};
struct get_data data = {
- .submit = __io_uring_flush_sq(ring),
.wait_nr = wait_nr,
.get_flags = IORING_ENTER_EXT_ARG,
.sz = sizeof(arg),
@@ -275,36 +256,77 @@ static int io_uring_wait_cqes_new(struct io_uring *ring,
* hence this function is safe to use for applications that split SQ and CQ
* handling between two threads.
*/
+static int __io_uring_submit_timeout(struct io_uring *ring, unsigned wait_nr,
+ struct __kernel_timespec *ts)
+{
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ /*
+ * If the SQ ring is full, we may need to submit IO first
+ */
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ ret = io_uring_submit(ring);
+ if (ret < 0)
+ return ret;
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe)
+ return -EAGAIN;
+ }
+ io_uring_prep_timeout(sqe, ts, wait_nr, 0);
+ sqe->user_data = LIBURING_UDATA_TIMEOUT;
+ return __io_uring_flush_sq(ring);
+}
+
int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
unsigned wait_nr, struct __kernel_timespec *ts,
sigset_t *sigmask)
{
- unsigned to_submit = 0;
+ int to_submit = 0;
if (ts) {
- struct io_uring_sqe *sqe;
- int ret;
-
if (ring->features & IORING_FEAT_EXT_ARG)
return io_uring_wait_cqes_new(ring, cqe_ptr, wait_nr,
ts, sigmask);
+ to_submit = __io_uring_submit_timeout(ring, wait_nr, ts);
+ if (to_submit < 0)
+ return to_submit;
+ }
- /*
- * If the SQ ring is full, we may need to submit IO first
- */
- sqe = io_uring_get_sqe(ring);
- if (!sqe) {
- ret = io_uring_submit(ring);
- if (ret < 0)
- return ret;
- sqe = io_uring_get_sqe(ring);
- if (!sqe)
- return -EAGAIN;
+ return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
+}
+
+int io_uring_submit_and_wait_timeout(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ unsigned wait_nr,
+ struct __kernel_timespec *ts,
+ sigset_t *sigmask)
+{
+ int to_submit;
+
+ if (ts) {
+ if (ring->features & IORING_FEAT_EXT_ARG) {
+ struct io_uring_getevents_arg arg = {
+ .sigmask = (unsigned long) sigmask,
+ .sigmask_sz = _NSIG / 8,
+ .ts = (unsigned long) ts
+ };
+ struct get_data data = {
+ .submit = __io_uring_flush_sq(ring),
+ .wait_nr = wait_nr,
+ .get_flags = IORING_ENTER_EXT_ARG,
+ .sz = sizeof(arg),
+ .arg = &arg
+ };
+
+ return _io_uring_get_cqe(ring, cqe_ptr, &data);
}
- io_uring_prep_timeout(sqe, ts, wait_nr, 0);
- sqe->user_data = LIBURING_UDATA_TIMEOUT;
+ to_submit = __io_uring_submit_timeout(ring, wait_nr, ts);
+ if (to_submit < 0)
+ return to_submit;
+ } else
to_submit = __io_uring_flush_sq(ring);
- }
return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
}
@@ -335,11 +357,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned submitted,
if (sq_ring_needs_enter(ring, &flags) || wait_nr) {
if (wait_nr || (ring->flags & IORING_SETUP_IOPOLL))
flags |= IORING_ENTER_GETEVENTS;
+ if (ring->int_flags & INT_FLAG_REG_RING)
+ flags |= IORING_ENTER_REGISTERED_RING;
- ret = __sys_io_uring_enter(ring->ring_fd, submitted, wait_nr,
- flags, NULL);
- if (ret < 0)
- return -errno;
+ ret = ____sys_io_uring_enter(ring->enter_ring_fd, submitted,
+ wait_nr, flags, NULL);
} else
ret = submitted;
@@ -371,34 +393,19 @@ int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
return __io_uring_submit_and_wait(ring, wait_nr);
}
-/*
- * Return an sqe to fill. Application must later call io_uring_submit()
- * when it's ready to tell the kernel about it. The caller may call this
- * function multiple times before calling io_uring_submit().
- *
- * Returns a vacant sqe, or NULL if we're full.
- */
+#ifdef LIBURING_INTERNAL
struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
- struct io_uring_sq *sq = &ring->sq;
- unsigned int head = io_uring_smp_load_acquire(sq->khead);
- unsigned int next = sq->sqe_tail + 1;
- struct io_uring_sqe *sqe = NULL;
-
- if (next - head <= *sq->kring_entries) {
- sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask];
- sq->sqe_tail = next;
- }
- return sqe;
+ return _io_uring_get_sqe(ring);
}
+#endif
int __io_uring_sqring_wait(struct io_uring *ring)
{
- int ret;
+ int flags = IORING_ENTER_SQ_WAIT;
- ret = __sys_io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_SQ_WAIT,
- NULL);
- if (ret < 0)
- ret = -errno;
- return ret;
+ if (ring->int_flags & INT_FLAG_REG_RING)
+ flags |= IORING_ENTER_REGISTERED_RING;
+
+ return ____sys_io_uring_enter(ring->enter_ring_fd, 0, 0, flags, NULL);
}
diff --git a/src/register.c b/src/register.c
index 994aaff..993c450 100644
--- a/src/register.c
+++ b/src/register.c
@@ -1,42 +1,91 @@
/* SPDX-License-Identifier: MIT */
#define _POSIX_C_SOURCE 200112L
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"
-#include "liburing.h"
-#include "syscall.h"
+int io_uring_register_buffers_update_tag(struct io_uring *ring, unsigned off,
+ const struct iovec *iovecs,
+ const __u64 *tags,
+ unsigned nr)
+{
+ struct io_uring_rsrc_update2 up = {
+ .offset = off,
+ .data = (unsigned long)iovecs,
+ .tags = (unsigned long)tags,
+ .nr = nr,
+ };
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_BUFFERS_UPDATE, &up,
+ sizeof(up));
+}
+
+int io_uring_register_buffers_tags(struct io_uring *ring,
+ const struct iovec *iovecs,
+ const __u64 *tags,
+ unsigned nr)
+{
+ struct io_uring_rsrc_register reg = {
+ .nr = nr,
+ .data = (unsigned long)iovecs,
+ .tags = (unsigned long)tags,
+ };
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_BUFFERS2, &reg,
+ sizeof(reg));
+}
+
+int io_uring_register_buffers_sparse(struct io_uring *ring, unsigned nr)
+{
+ struct io_uring_rsrc_register reg = {
+ .flags = IORING_RSRC_REGISTER_SPARSE,
+ .nr = nr,
+ };
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_BUFFERS2, &reg,
+ sizeof(reg));
+}
int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs,
unsigned nr_iovecs)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_BUFFERS,
+ ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_BUFFERS,
iovecs, nr_iovecs);
- if (ret < 0)
- return -errno;
-
- return 0;
+ return (ret < 0) ? ret : 0;
}
int io_uring_unregister_buffers(struct io_uring *ring)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_BUFFERS,
- NULL, 0);
- if (ret < 0)
- return -errno;
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_BUFFERS, NULL, 0);
+ return (ret < 0) ? ret : 0;
+}
- return 0;
+int io_uring_register_files_update_tag(struct io_uring *ring, unsigned off,
+ const int *files, const __u64 *tags,
+ unsigned nr_files)
+{
+ struct io_uring_rsrc_update2 up = {
+ .offset = off,
+ .data = (unsigned long)files,
+ .tags = (unsigned long)tags,
+ .nr = nr_files,
+ };
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_FILES_UPDATE2, &up,
+ sizeof(up));
}
/*
@@ -53,76 +102,138 @@ int io_uring_register_files_update(struct io_uring *ring, unsigned off,
.offset = off,
.fds = (unsigned long) files,
};
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_FILES_UPDATE, &up,
+ nr_files);
+}
+
+static int increase_rlimit_nofile(unsigned nr)
+{
int ret;
+ struct rlimit rlim;
- ret = __sys_io_uring_register(ring->ring_fd,
- IORING_REGISTER_FILES_UPDATE, &up,
- nr_files);
+ ret = __sys_getrlimit(RLIMIT_NOFILE, &rlim);
if (ret < 0)
- return -errno;
+ return ret;
+
+ if (rlim.rlim_cur < nr) {
+ rlim.rlim_cur += nr;
+ __sys_setrlimit(RLIMIT_NOFILE, &rlim);
+ }
+
+ return 0;
+}
+
+int io_uring_register_files_sparse(struct io_uring *ring, unsigned nr)
+{
+ struct io_uring_rsrc_register reg = {
+ .flags = IORING_RSRC_REGISTER_SPARSE,
+ .nr = nr,
+ };
+ int ret, did_increase = 0;
+
+ do {
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_FILES2, &reg,
+ sizeof(reg));
+ if (ret >= 0)
+ break;
+ if (ret == -EMFILE && !did_increase) {
+ did_increase = 1;
+ increase_rlimit_nofile(nr);
+ continue;
+ }
+ break;
+ } while (1);
return ret;
}
-int io_uring_register_files(struct io_uring *ring, const int *files,
- unsigned nr_files)
+int io_uring_register_files_tags(struct io_uring *ring, const int *files,
+ const __u64 *tags, unsigned nr)
{
- int ret;
+ struct io_uring_rsrc_register reg = {
+ .nr = nr,
+ .data = (unsigned long)files,
+ .tags = (unsigned long)tags,
+ };
+ int ret, did_increase = 0;
+
+ do {
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_FILES2, &reg,
+ sizeof(reg));
+ if (ret >= 0)
+ break;
+ if (ret == -EMFILE && !did_increase) {
+ did_increase = 1;
+ increase_rlimit_nofile(nr);
+ continue;
+ }
+ break;
+ } while (1);
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_FILES,
- files, nr_files);
- if (ret < 0)
- return -errno;
+ return ret;
+}
- return 0;
+int io_uring_register_files(struct io_uring *ring, const int *files,
+ unsigned nr_files)
+{
+ int ret, did_increase = 0;
+
+ do {
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_FILES, files,
+ nr_files);
+ if (ret >= 0)
+ break;
+ if (ret == -EMFILE && !did_increase) {
+ did_increase = 1;
+ increase_rlimit_nofile(nr_files);
+ continue;
+ }
+ break;
+ } while (1);
+
+ return ret;
}
int io_uring_unregister_files(struct io_uring *ring)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_FILES,
+ ret = ____sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_FILES,
NULL, 0);
- if (ret < 0)
- return -errno;
-
- return 0;
+ return (ret < 0) ? ret : 0;
}
int io_uring_register_eventfd(struct io_uring *ring, int event_fd)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD,
+ ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD,
&event_fd, 1);
- if (ret < 0)
- return -errno;
-
- return 0;
+ return (ret < 0) ? ret : 0;
}
int io_uring_unregister_eventfd(struct io_uring *ring)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_EVENTFD,
- NULL, 0);
- if (ret < 0)
- return -errno;
-
- return 0;
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_EVENTFD, NULL, 0);
+ return (ret < 0) ? ret : 0;
}
int io_uring_register_eventfd_async(struct io_uring *ring, int event_fd)
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_EVENTFD_ASYNC,
- &event_fd, 1);
- if (ret < 0)
- return -errno;
-
- return 0;
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_EVENTFD_ASYNC,
+ &event_fd, 1);
+ return (ret < 0) ? ret : 0;
}
int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p,
@@ -130,36 +241,22 @@ int io_uring_register_probe(struct io_uring *ring, struct io_uring_probe *p,
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PROBE,
- p, nr_ops);
- if (ret < 0)
- return -errno;
-
- return 0;
+ ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PROBE, p,
+ nr_ops);
+ return (ret < 0) ? ret : 0;
}
int io_uring_register_personality(struct io_uring *ring)
{
- int ret;
-
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_PERSONALITY,
- NULL, 0);
- if (ret < 0)
- return -errno;
-
- return ret;
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_PERSONALITY, NULL, 0);
}
int io_uring_unregister_personality(struct io_uring *ring, int id)
{
- int ret;
-
- ret = __sys_io_uring_register(ring->ring_fd, IORING_UNREGISTER_PERSONALITY,
- NULL, id);
- if (ret < 0)
- return -errno;
-
- return ret;
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_PERSONALITY, NULL,
+ id);
}
int io_uring_register_restrictions(struct io_uring *ring,
@@ -168,22 +265,83 @@ int io_uring_register_restrictions(struct io_uring *ring,
{
int ret;
- ret = __sys_io_uring_register(ring->ring_fd, IORING_REGISTER_RESTRICTIONS,
- res, nr_res);
- if (ret < 0)
- return -errno;
-
- return 0;
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_RESTRICTIONS, res,
+ nr_res);
+ return (ret < 0) ? ret : 0;
}
int io_uring_enable_rings(struct io_uring *ring)
{
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_ENABLE_RINGS, NULL, 0);
+}
+
+int io_uring_register_iowq_aff(struct io_uring *ring, size_t cpusz,
+ const cpu_set_t *mask)
+{
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_IOWQ_AFF, mask, cpusz);
+}
+
+int io_uring_unregister_iowq_aff(struct io_uring *ring)
+{
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_IOWQ_AFF, NULL, 0);
+}
+
+int io_uring_register_iowq_max_workers(struct io_uring *ring, unsigned int *val)
+{
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_IOWQ_MAX_WORKERS, val,
+ 2);
+}
+
+int io_uring_register_ring_fd(struct io_uring *ring)
+{
+ struct io_uring_rsrc_update up = {
+ .data = ring->ring_fd,
+ .offset = -1U,
+ };
int ret;
- ret = __sys_io_uring_register(ring->ring_fd,
- IORING_REGISTER_ENABLE_RINGS, NULL, 0);
- if (ret < 0)
- return -errno;
+ ret = ____sys_io_uring_register(ring->ring_fd, IORING_REGISTER_RING_FDS,
+ &up, 1);
+ if (ret == 1) {
+ ring->enter_ring_fd = up.offset;
+ ring->int_flags |= INT_FLAG_REG_RING;
+ }
+ return ret;
+}
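
Usage sketch (illustration only): register the ring fd right after setup; queue.c above then ORs IORING_ENTER_REGISTERED_RING into every io_uring_enter call, skipping the per-call fd lookup. Failure is deliberately ignored here for kernels without IORING_REGISTER_RING_FDS.

#include <liburing.h>

/* Initialize a ring and opt in to the registered ring fd fast path. */
static int init_registered(struct io_uring *ring, unsigned entries)
{
	int ret = io_uring_queue_init(entries, ring, 0);

	if (ret)
		return ret;
	/* returns 1 on success; ignore failure on older kernels */
	io_uring_register_ring_fd(ring);
	return 0;
}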
+
+
+int io_uring_unregister_ring_fd(struct io_uring *ring)
+{
+ struct io_uring_rsrc_update up = {
+ .offset = ring->enter_ring_fd,
+ };
+ int ret;
+ ret = ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_RING_FDS, &up, 1);
+ if (ret == 1) {
+ ring->enter_ring_fd = ring->ring_fd;
+ ring->int_flags &= ~INT_FLAG_REG_RING;
+ }
return ret;
}
+
+int io_uring_register_buf_ring(struct io_uring *ring,
+ struct io_uring_buf_reg *reg, unsigned int flags)
+{
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_REGISTER_PBUF_RING, reg, 1);
+}
+
+int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid)
+{
+ struct io_uring_buf_reg reg = { .bgid = bgid };
+
+ return ____sys_io_uring_register(ring->ring_fd,
+ IORING_UNREGISTER_PBUF_RING, &reg, 1);
+}
diff --git a/src/setup.c b/src/setup.c
index 54225e8..d2adc7f 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -1,26 +1,18 @@
/* SPDX-License-Identifier: MIT */
#define _DEFAULT_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include <signal.h>
-
+#include "lib.h"
+#include "syscall.h"
+#include "liburing.h"
+#include "int_flags.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"
-#include "liburing.h"
-
-#include "syscall.h"
static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
- munmap(sq->ring_ptr, sq->ring_sz);
+ __sys_munmap(sq->ring_ptr, sq->ring_sz);
if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
- munmap(cq->ring_ptr, cq->ring_sz);
+ __sys_munmap(cq->ring_ptr, cq->ring_sz);
}
static int io_uring_mmap(int fd, struct io_uring_params *p,
@@ -29,27 +21,33 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
size_t size;
int ret;
+ size = sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ size += sizeof(struct io_uring_cqe);
+
sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
- cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
+ cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;
if (p->features & IORING_FEAT_SINGLE_MMAP) {
if (cq->ring_sz > sq->ring_sz)
sq->ring_sz = cq->ring_sz;
cq->ring_sz = sq->ring_sz;
}
- sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
- if (sq->ring_ptr == MAP_FAILED)
- return -errno;
+ sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_SQ_RING);
+ if (IS_ERR(sq->ring_ptr))
+ return PTR_ERR(sq->ring_ptr);
if (p->features & IORING_FEAT_SINGLE_MMAP) {
cq->ring_ptr = sq->ring_ptr;
} else {
- cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
- if (cq->ring_ptr == MAP_FAILED) {
+ cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_CQ_RING);
+ if (IS_ERR(cq->ring_ptr)) {
+ ret = PTR_ERR(cq->ring_ptr);
cq->ring_ptr = NULL;
- ret = -errno;
goto err;
}
}
@@ -62,12 +60,13 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
sq->array = sq->ring_ptr + p->sq_off.array;
- size = p->sq_entries * sizeof(struct io_uring_sqe);
- sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, fd,
- IORING_OFF_SQES);
- if (sq->sqes == MAP_FAILED) {
- ret = -errno;
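+	/* with IORING_SETUP_SQE128, each SQE is 64 bytes larger (128 total) */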
+ size = sizeof(struct io_uring_sqe);
+ if (p->flags & IORING_SETUP_SQE128)
+ size += 64;
+ sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+ if (IS_ERR(sq->sqes)) {
+ ret = PTR_ERR(sq->sqes);
err:
io_uring_unmap_rings(sq, cq);
return ret;
@@ -98,7 +97,8 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring
ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
if (!ret) {
ring->flags = p->flags;
- ring->ring_fd = fd;
+ ring->ring_fd = ring->enter_ring_fd = fd;
+ ring->int_flags = 0;
}
return ret;
}
@@ -115,21 +115,24 @@ int io_uring_ring_dontfork(struct io_uring *ring)
if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
return -EINVAL;
- len = *ring->sq.kring_entries * sizeof(struct io_uring_sqe);
- ret = madvise(ring->sq.sqes, len, MADV_DONTFORK);
- if (ret == -1)
- return -errno;
+ len = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ len += 64;
+ len *= *ring->sq.kring_entries;
+ ret = __sys_madvise(ring->sq.sqes, len, MADV_DONTFORK);
+ if (ret < 0)
+ return ret;
len = ring->sq.ring_sz;
- ret = madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
- if (ret == -1)
- return -errno;
+ ret = __sys_madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
+ if (ret < 0)
+ return ret;
if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
len = ring->cq.ring_sz;
- ret = madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
- if (ret == -1)
- return -errno;
+ ret = __sys_madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
+ if (ret < 0)
+ return ret;
}
return 0;
@@ -140,13 +143,13 @@ int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
{
int fd, ret;
- fd = __sys_io_uring_setup(entries, p);
+ fd = ____sys_io_uring_setup(entries, p);
if (fd < 0)
- return -errno;
+ return fd;
ret = io_uring_queue_mmap(fd, p, ring);
if (ret) {
- close(fd);
+ __sys_close(fd);
return ret;
}
@@ -172,10 +175,20 @@ void io_uring_queue_exit(struct io_uring *ring)
{
struct io_uring_sq *sq = &ring->sq;
struct io_uring_cq *cq = &ring->cq;
+ size_t sqe_size;
- munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
+ sqe_size = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ sqe_size += 64;
+ __sys_munmap(sq->sqes, sqe_size * *sq->kring_entries);
io_uring_unmap_rings(sq, cq);
- close(ring->ring_fd);
+ /*
+ * Not strictly required, but frees up the slot we used now rather
+ * than at process exit time.
+ */
+ if (ring->int_flags & INT_FLAG_REG_RING)
+ io_uring_unregister_ring_fd(ring);
+ __sys_close(ring->ring_fd);
}
struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
@@ -185,7 +198,7 @@ struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
int r;
len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
- probe = malloc(len);
+ probe = uring_malloc(len);
if (!probe)
return NULL;
memset(probe, 0, len);
@@ -194,7 +207,7 @@ struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
if (r >= 0)
return probe;
- free(probe);
+ uring_free(probe);
return NULL;
}
@@ -215,36 +228,14 @@ struct io_uring_probe *io_uring_get_probe(void)
void io_uring_free_probe(struct io_uring_probe *probe)
{
- free(probe);
+ uring_free(probe);
}
-static int __fls(int x)
+static inline int __fls(int x)
{
- int r = 32;
-
if (!x)
return 0;
- if (!(x & 0xffff0000u)) {
- x <<= 16;
- r -= 16;
- }
- if (!(x & 0xff000000u)) {
- x <<= 8;
- r -= 8;
- }
- if (!(x & 0xf0000000u)) {
- x <<= 4;
- r -= 4;
- }
- if (!(x & 0xc0000000u)) {
- x <<= 2;
- r -= 2;
- }
- if (!(x & 0x80000000u)) {
- x <<= 1;
- r -= 1;
- }
- return r;
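+	/* 1-based index of the most significant set bit */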
+ return 8 * sizeof(x) - __builtin_clz(x);
}
static unsigned roundup_pow2(unsigned depth)
@@ -261,16 +252,23 @@ static size_t npages(size_t size, unsigned page_size)
#define KRING_SIZE 320
-static size_t rings_size(unsigned entries, unsigned cq_entries, unsigned page_size)
+static size_t rings_size(struct io_uring_params *p, unsigned entries,
+ unsigned cq_entries, unsigned page_size)
{
size_t pages, sq_size, cq_size;
- cq_size = KRING_SIZE;
- cq_size += cq_entries * sizeof(struct io_uring_cqe);
+ cq_size = sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ cq_size += sizeof(struct io_uring_cqe);
+ cq_size *= cq_entries;
+ cq_size += KRING_SIZE;
cq_size = (cq_size + 63) & ~63UL;
pages = (size_t) 1 << npages(cq_size, page_size);
- sq_size = sizeof(struct io_uring_sqe) * entries;
+ sq_size = sizeof(struct io_uring_sqe);
+ if (p->flags & IORING_SETUP_SQE128)
+ sq_size += 64;
+ sq_size *= entries;
pages += (size_t) 1 << npages(sq_size, page_size);
return pages * page_size;
}
@@ -337,11 +335,8 @@ ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
cq_entries = 2 * entries;
}
- page_size = sysconf(_SC_PAGESIZE);
- if (page_size < 0)
- page_size = 4096;
-
- return rings_size(entries, cq_entries, page_size);
+ page_size = get_page_size();
+ return rings_size(p, entries, cq_entries, page_size);
}
/*
diff --git a/src/syscall.c b/src/syscall.c
index 2fd3dd4..362f1f5 100644
--- a/src/syscall.c
+++ b/src/syscall.c
@@ -2,6 +2,16 @@
#define _DEFAULT_SOURCE
/*
+ * Functions in this file require libc; only build them when building with libc.
+ *
+ * Note:
+ * liburing's tests still need these functions.
+ */
+#if defined(CONFIG_NOLIBC) && !defined(LIBURING_BUILD_TEST)
+#error "This file should only be compiled for libc build, or for liburing tests"
+#endif
+
+/*
* Will go away once libc support is there
*/
#include <unistd.h>
@@ -11,32 +21,6 @@
#include "liburing/io_uring.h"
#include "syscall.h"
-#ifdef __alpha__
-/*
- * alpha is the only exception, all other architectures
- * have common numbers for new system calls.
- */
-# ifndef __NR_io_uring_setup
-# define __NR_io_uring_setup 535
-# endif
-# ifndef __NR_io_uring_enter
-# define __NR_io_uring_enter 536
-# endif
-# ifndef __NR_io_uring_register
-# define __NR_io_uring_register 537
-# endif
-#else /* !__alpha__ */
-# ifndef __NR_io_uring_setup
-# define __NR_io_uring_setup 425
-# endif
-# ifndef __NR_io_uring_enter
-# define __NR_io_uring_enter 426
-# endif
-# ifndef __NR_io_uring_register
-# define __NR_io_uring_register 427
-# endif
-#endif
-
int __sys_io_uring_register(int fd, unsigned opcode, const void *arg,
unsigned nr_args)
{
@@ -49,15 +33,15 @@ int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
}
int __sys_io_uring_enter2(int fd, unsigned to_submit, unsigned min_complete,
- unsigned flags, sigset_t *sig, int sz)
+ unsigned flags, sigset_t *sig, int sz)
{
- return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
- flags, sig, sz);
+ return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags,
+ sig, sz);
}
int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
unsigned flags, sigset_t *sig)
{
return __sys_io_uring_enter2(fd, to_submit, min_complete, flags, sig,
- _NSIG / 8);
+ _NSIG / 8);
}
diff --git a/src/syscall.h b/src/syscall.h
index 3b94efc..214789d 100644
--- a/src/syscall.h
+++ b/src/syscall.h
@@ -2,19 +2,102 @@
#ifndef LIBURING_SYSCALL_H
#define LIBURING_SYSCALL_H
+#include <errno.h>
#include <signal.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <liburing.h>
+
+#ifdef __alpha__
+/*
+ * alpha and mips are exceptions; all other architectures have
+ * common numbers for new system calls.
+ */
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup 535
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter 536
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register 537
+#endif
+#elif defined __mips__
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup (__NR_Linux + 425)
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter (__NR_Linux + 426)
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register (__NR_Linux + 427)
+#endif
+#else /* !__alpha__ and !__mips__ */
+#ifndef __NR_io_uring_setup
+#define __NR_io_uring_setup 425
+#endif
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter 426
+#endif
+#ifndef __NR_io_uring_register
+#define __NR_io_uring_register 427
+#endif
+#endif
+
+/*
+ * Don't put this below the #include "arch/$arch/syscall.h"; that
+ * file may need it.
+ */
struct io_uring_params;
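+/*
+ * Kernel-style error pointers: the last 4095 values of the address space
+ * encode a negative errno returned by the raw syscall wrappers.
+ */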
+static inline void *ERR_PTR(intptr_t n)
+{
+ return (void *) n;
+}
+
+static inline intptr_t PTR_ERR(const void *ptr)
+{
+ return (intptr_t) ptr;
+}
+
+static inline bool IS_ERR(const void *ptr)
+{
+ return uring_unlikely((uintptr_t) ptr >= (uintptr_t) -4095UL);
+}
+
+#define __INTERNAL__LIBURING_SYSCALL_H
+#if defined(__x86_64__) || defined(__i386__)
+#include "arch/x86/syscall.h"
+#elif defined(__aarch64__)
+#include "arch/aarch64/syscall.h"
+#else
+/*
+ * We don't have native syscall wrappers
+ * for this arch. Must use libc!
+ */
+#ifdef CONFIG_NOLIBC
+ #error "This arch doesn't support building liburing without libc"
+#endif
+/* libc syscall wrappers. */
+#include "arch/generic/syscall.h"
+#endif
+#undef __INTERNAL__LIBURING_SYSCALL_H
+
/*
- * System calls
+ * For backward compatibility.
+ * (these __sys* functions always use libc, see syscall.c)
*/
-extern int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p);
-extern int __sys_io_uring_enter(int fd, unsigned to_submit,
- unsigned min_complete, unsigned flags, sigset_t *sig);
-extern int __sys_io_uring_enter2(int fd, unsigned to_submit,
- unsigned min_complete, unsigned flags, sigset_t *sig, int sz);
-extern int __sys_io_uring_register(int fd, unsigned int opcode, const void *arg,
- unsigned int nr_args);
+int __sys_io_uring_setup(unsigned entries, struct io_uring_params *p);
+int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+ unsigned flags, sigset_t *sig);
+int __sys_io_uring_enter2(int fd, unsigned to_submit, unsigned min_complete,
+ unsigned flags, sigset_t *sig, int sz);
+int __sys_io_uring_register(int fd, unsigned int opcode, const void *arg,
+ unsigned int nr_args);
#endif
diff --git a/test/232c93d07b74-test.c b/test/232c93d07b74.c
index cd194cb..8a7810b 100644
--- a/test/232c93d07b74-test.c
+++ b/test/232c93d07b74.c
@@ -19,13 +19,14 @@
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include "liburing.h"
#define RECV_BUFF_SIZE 2
#define SEND_BUFF_SIZE 3
-#define PORT 0x1235
+#define PORT 0x1234
struct params {
int tcp;
@@ -75,8 +76,8 @@ static void *rcv(void *arg)
struct sockaddr_in addr;
addr.sin_family = AF_INET;
- addr.sin_port = PORT;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_port = htons(PORT);
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
res = bind(s0, (struct sockaddr *) &addr, sizeof(addr));
assert(res != -1);
} else {
@@ -190,8 +191,8 @@ static void *snd(void *arg)
struct sockaddr_in addr;
addr.sin_family = AF_INET;
- addr.sin_port = PORT;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_port = htons(PORT);
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
ret = connect(s0, (struct sockaddr*) &addr, sizeof(addr));
assert(ret != -1);
} else {
diff --git a/test/35fa71a030ca-test.c b/test/35fa71a030ca.c
index f5fcc4d..9a6ddb6 100644
--- a/test/35fa71a030ca-test.c
+++ b/test/35fa71a030ca.c
@@ -238,8 +238,7 @@ static void execute_one(void);
static void loop(void)
{
- int iter;
- for (iter = 0;; iter++) {
+ for (;;) {
int pid = fork();
if (pid < 0)
exit(1);
@@ -321,6 +320,9 @@ int main(int argc, char *argv[])
return 0;
signal(SIGINT, sig_int);
mmap((void *) 0x20000000, 0x1000000, 3, 0x32, -1, 0);
+ signal(SIGALRM, sig_int);
+ alarm(5);
+
loop();
return 0;
}
diff --git a/test/500f9fbadef8-test.c b/test/500f9fbadef8.c
index dbd5751..dbd5751 100644
--- a/test/500f9fbadef8-test.c
+++ b/test/500f9fbadef8.c
diff --git a/test/7ad0e4b2f83c-test.c b/test/7ad0e4b2f83c.c
index 4d760e1..4d760e1 100644
--- a/test/7ad0e4b2f83c-test.c
+++ b/test/7ad0e4b2f83c.c
diff --git a/test/8a9973408177-test.c b/test/8a9973408177.c
index 94bf781..94bf781 100644
--- a/test/8a9973408177-test.c
+++ b/test/8a9973408177.c
diff --git a/test/917257daa0fe-test.c b/test/917257daa0fe.c
index 1d00ef1..1d00ef1 100644
--- a/test/917257daa0fe-test.c
+++ b/test/917257daa0fe.c
diff --git a/test/Makefile b/test/Makefile
index 2f0a694..e3204a7 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -3,294 +3,241 @@ datadir ?= $(prefix)/share
INSTALL=install
-CPPFLAGS ?=
-override CPPFLAGS += -D_GNU_SOURCE -D__SANE_USERSPACE_TYPES__ \
- -I../src/include/ -include ../config-host.h
-CFLAGS ?= -g -O2
-XCFLAGS =
-override CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare \
- -L../src/
-CXXFLAGS ?=
-override CXXFLAGS += $(CFLAGS) -std=c++11
-
-test_targets += \
- 232c93d07b74-test \
- 35fa71a030ca-test \
- 500f9fbadef8-test \
- 7ad0e4b2f83c-test \
- 8a9973408177-test \
- 917257daa0fe-test \
- a0908ae19763-test \
- a4c0b3decb33-test \
- accept \
- accept-link \
- accept-reuse \
- accept-test \
- across-fork splice \
- b19062a56726-test \
- b5837bd5311d-test \
- ce593a6c480a-test \
- close-opath \
- connect \
- cq-full \
- cq-overflow \
- cq-peek-batch \
- cq-ready \
- cq-size \
- d4ae271dfaae-test \
- d77a67ed5f27-test \
- defer \
- double-poll-crash \
- eeed8b54e0df-test \
- eventfd \
- eventfd-disable \
- eventfd-ring \
- fadvise \
- fallocate \
- fc2a85cb02ef-test \
- file-register \
- file-update \
- files-exit-hang-poll \
- files-exit-hang-timeout \
- fixed-link \
- fsync \
- hardlink \
- io-cancel \
- io_uring_enter \
- io_uring_register \
- io_uring_setup \
- iopoll \
- lfs-openat \
- lfs-openat-write \
- link \
- link-timeout \
- link_drain \
- madvise \
- mkdir \
- multicqes_drain \
- nop \
- nop-all-sizes \
- open-close \
- openat2 \
- personality \
- pipe-eof \
- pipe-reuse \
- poll \
- poll-cancel \
- poll-cancel-ton \
- poll-link \
- poll-many \
- poll-mshot-update \
- poll-ring \
- poll-v-poll \
- probe \
- read-write \
- register-restrictions \
- rename \
- ring-leak \
- ring-leak2 \
- rw_merge_test \
- self \
- send_recv \
- send_recvmsg \
- shared-wq \
- short-read \
- shutdown \
- sigfd-deadlock \
- socket-rw \
- socket-rw-eagain \
- sq-full \
- sq-poll-dup \
- sq-poll-kthread \
- sq-poll-share \
- sqpoll-disable-exit \
- sqpoll-exit-hang \
- sqpoll-sleep \
- sq-space_left \
- stdout \
- submit-reuse \
- symlink \
- teardowns \
- thread-exit \
- timeout \
- timeout-new \
- timeout-overflow \
- unlink \
- wakeup-hang \
- sendmsg_fs_cve \
- rsrc_tags \
- # EOL
-
-all_targets += $(test_targets)
-
-include ../Makefile.quiet
-
ifneq ($(MAKECMDGOALS),clean)
include ../config-host.mak
endif
-ifdef CONFIG_HAVE_STATX
-test_targets += statx
-endif
-all_targets += statx
+CPPFLAGS ?=
-ifdef CONFIG_HAVE_CXX
-test_targets += sq-full-cpp
-endif
-all_targets += sq-full-cpp
+override CPPFLAGS += \
+ -D_GNU_SOURCE \
+ -D__SANE_USERSPACE_TYPES__ \
+ -I../src/include/ \
+ -include ../config-host.h
-helpers = helpers.o
+CFLAGS ?= -g -O2 -Wall -Wextra
+XCFLAGS = -Wno-unused-parameter -Wno-sign-compare
-all: ${helpers} $(test_targets)
+ifdef CONFIG_HAVE_STRINGOP_OVERFLOW
+ XCFLAGS += -Wstringop-overflow=0
+endif
-helpers.o: helpers.c helpers.c
- $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< -luring
+ifdef CONFIG_HAVE_ARRAY_BOUNDS
+ XCFLAGS += -Warray-bounds=0
+endif
-%: %.c ${helpers} helpers.h
- $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< ${helpers} -luring $(XCFLAGS)
+CXXFLAGS ?= $(CFLAGS)
+override CFLAGS += $(XCFLAGS) -DLIBURING_BUILD_TEST
+override CXXFLAGS += $(XCFLAGS) -std=c++11 -DLIBURING_BUILD_TEST
-%: %.cc ${helpers} helpers.h
- $(QUIET_CXX)$(CXX) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< ${helpers} -luring $(XCFLAGS)
+LDFLAGS ?=
+override LDFLAGS += -L../src/ -luring -lpthread
+# Please keep this list sorted alphabetically.
test_srcs := \
- helpers.c \
- 232c93d07b74-test.c \
- 35fa71a030ca-test.c \
- 500f9fbadef8-test.c \
- 7ad0e4b2f83c-test.c \
- 8a9973408177-test.c \
- 917257daa0fe-test.c \
- a0908ae19763-test.c \
- a4c0b3decb33-test.c \
+ 232c93d07b74.c \
+ 35fa71a030ca.c \
+ 500f9fbadef8.c \
+ 7ad0e4b2f83c.c \
+ 8a9973408177.c \
+ 917257daa0fe.c \
+ a0908ae19763.c \
+ a4c0b3decb33.c \
+ accept.c \
accept-link.c \
accept-reuse.c \
accept-test.c \
- accept.c \
across-fork.c \
- b19062a56726-test.c \
- b5837bd5311d-test.c \
- ce593a6c480a-test.c \
+ b19062a56726.c \
+ b5837bd5311d.c \
+ buf-ring.c \
+ ce593a6c480a.c \
close-opath.c \
connect.c \
cq-full.c \
cq-overflow.c \
cq-peek-batch.c \
- cq-ready.c\
+ cq-ready.c \
cq-size.c \
- d4ae271dfaae-test.c \
- d77a67ed5f27-test.c \
+ d4ae271dfaae.c \
+ d77a67ed5f27.c \
defer.c \
double-poll-crash.c \
- eeed8b54e0df-test.c \
+ drop-submit.c \
+ eeed8b54e0df.c \
+ empty-eownerdead.c \
+ eventfd.c \
eventfd-disable.c \
+ eventfd-reg.c \
eventfd-ring.c \
- eventfd.c \
+ exec-target.c \
+ exit-no-cleanup.c \
fadvise.c \
fallocate.c \
- fc2a85cb02ef-test.c \
+ fc2a85cb02ef.c \
file-register.c \
- file-update.c \
files-exit-hang-poll.c \
files-exit-hang-timeout.c \
+ file-update.c \
+ file-verify.c \
+ fixed-buf-iter.c \
fixed-link.c \
+ fixed-reuse.c \
+ fpos.c \
fsync.c \
hardlink.c \
io-cancel.c \
+ iopoll.c \
io_uring_enter.c \
io_uring_register.c \
io_uring_setup.c \
- iopoll.c \
- lfs-openat-write.c \
lfs-openat.c \
- link-timeout.c \
+ lfs-openat-write.c \
link.c \
link_drain.c \
+ link-timeout.c \
madvise.c \
mkdir.c \
+ msg-ring.c \
multicqes_drain.c \
nop-all-sizes.c \
nop.c \
- open-close.c \
openat2.c \
+ open-close.c \
+ open-direct-link.c \
+ open-direct-pick.c \
personality.c \
pipe-eof.c \
pipe-reuse.c \
- poll-cancel-ton.c \
+ poll.c \
poll-cancel.c \
+ poll-cancel-all.c \
+ poll-cancel-ton.c \
poll-link.c \
poll-many.c \
poll-mshot-update.c \
poll-ring.c \
poll-v-poll.c \
- poll.c \
+ pollfree.c \
probe.c \
+ read-before-exit.c \
read-write.c \
+ recv-msgall.c \
+ recv-msgall-stream.c \
register-restrictions.c \
rename.c \
- ring-leak.c \
+ ringbuf-read.c \
ring-leak2.c \
+ ring-leak.c \
+ rsrc_tags.c \
rw_merge_test.c \
self.c \
+ sendmsg_fs_cve.c \
+ send_recv.c \
send_recvmsg.c \
shared-wq.c \
short-read.c \
shutdown.c \
sigfd-deadlock.c \
+ skip-cqe.c \
+ socket.c \
socket-rw.c \
socket-rw-eagain.c \
+ socket-rw-offset.c \
splice.c \
- sq-full-cpp.cc \
sq-full.c \
+ sq-full-cpp.cc \
+ sqpoll-cancel-hang.c \
+ sqpoll-disable-exit.c \
sq-poll-dup.c \
+ sqpoll-exit-hang.c \
sq-poll-kthread.c \
sq-poll-share.c \
- sqpoll-disable-exit.c \
- sqpoll-exit-hang.c \
sqpoll-sleep.c \
sq-space_left.c \
- statx.c \
stdout.c \
+ submit-link-fail.c \
submit-reuse.c \
symlink.c \
teardowns.c \
thread-exit.c \
+ timeout.c \
timeout-new.c \
timeout-overflow.c \
- timeout.c \
+ tty-write-dpoll.c \
unlink.c \
wakeup-hang.c \
- sendmsg_fs_cve.c \
- rsrc_tags.c \
+ xattr.c \
# EOL
-test_objs := $(patsubst %.c,%.ol,$(patsubst %.cc,%.ol,$(test_srcs)))
+all_targets :=
+include ../Makefile.quiet
+
+ifdef CONFIG_HAVE_STATX
+ test_srcs += statx.c
+else ifdef CONFIG_HAVE_GLIBC_STATX
+ test_srcs += statx.c
+endif
+all_targets += statx.t
+
+ifdef CONFIG_HAVE_CXX
+ test_srcs += sq-full-cpp.cc
+endif
+all_targets += sq-full-cpp.t
+
+test_targets := $(patsubst %.c,%,$(test_srcs))
+test_targets := $(patsubst %.cc,%,$(test_targets))
+run_test_targets := $(patsubst %,%.run_test,$(test_targets))
+test_targets := $(patsubst %,%.t,$(test_targets))
+all_targets += $(test_targets)
+
+#
+# Build ../src/syscall.c from the test Makefile to support the
+# liburing nolibc configuration.
+#
+# Functions in ../src/syscall.c require libc. If liburing is built
+# without libc, those functions are not in liburing.a, so build
+# them here manually.
+#
+helpers = helpers.o ../src/syscall.o
+
+all: $(test_targets)
+
+../src/syscall.o: ../src/syscall.c
+ $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $<
+
+helpers.o: helpers.c
+ $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $<
+
+%.t: %.c $(helpers) helpers.h ../src/liburing.a
+ $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(helpers) $(LDFLAGS)
+
+%.t: %.cc $(helpers) helpers.h ../src/liburing.a
+ $(QUIET_CXX)$(CXX) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(helpers) $(LDFLAGS)
-35fa71a030ca-test: XCFLAGS = -lpthread
-232c93d07b74-test: XCFLAGS = -lpthread
-send_recv: XCFLAGS = -lpthread
-send_recvmsg: XCFLAGS = -lpthread
-poll-link: XCFLAGS = -lpthread
-accept-link: XCFLAGS = -lpthread
-submit-reuse: XCFLAGS = -lpthread
-poll-v-poll: XCFLAGS = -lpthread
-across-fork: XCFLAGS = -lpthread
-ce593a6c480a-test: XCFLAGS = -lpthread
-wakeup-hang: XCFLAGS = -lpthread
-pipe-eof: XCFLAGS = -lpthread
-timeout-new: XCFLAGS = -lpthread
-thread-exit: XCFLAGS = -lpthread
-ring-leak2: XCFLAGS = -lpthread
-poll-mshot-update: XCFLAGS = -lpthread
install: $(test_targets) runtests.sh runtests-loop.sh
$(INSTALL) -D -d -m 755 $(datadir)/liburing-test/
$(INSTALL) -D -m 755 $(test_targets) $(datadir)/liburing-test/
$(INSTALL) -D -m 755 runtests.sh $(datadir)/liburing-test/
$(INSTALL) -D -m 755 runtests-loop.sh $(datadir)/liburing-test/
+
clean:
- @rm -f $(all_targets) $(test_objs) helpers.o
+ @rm -f $(all_targets) helpers.o output/*
+ @rm -rf output/
runtests: all
@./runtests.sh $(test_targets)
+
runtests-loop: all
@./runtests-loop.sh $(test_targets)
+
+%.run_test: %.t
+ @./runtests-quiet.sh $<
+
+runtests-parallel: $(run_test_targets)
+ @echo "All tests passed"
+
+.PHONY: all install clean runtests runtests-loop runtests-parallel
diff --git a/test/a0908ae19763-test.c b/test/a0908ae19763.c
index 00cb559..00cb559 100644
--- a/test/a0908ae19763-test.c
+++ b/test/a0908ae19763.c
diff --git a/test/a4c0b3decb33-test.c b/test/a4c0b3decb33.c
index 34b0af2..34b0af2 100644
--- a/test/a4c0b3decb33-test.c
+++ b/test/a4c0b3decb33.c
diff --git a/test/accept-link.c b/test/accept-link.c
index 605e0ec..f111275 100644
--- a/test/accept-link.c
+++ b/test/accept-link.c
@@ -11,6 +11,7 @@
#include <netinet/tcp.h>
#include <netinet/in.h>
#include <poll.h>
+#include <arpa/inet.h>
#include "liburing.h"
@@ -42,7 +43,8 @@ struct data {
unsigned expected[2];
unsigned just_positive[2];
unsigned long timeout;
- int port;
+ unsigned short port;
+ unsigned int addr;
int stop;
};
@@ -63,7 +65,7 @@ static void *send_thread(void *arg)
addr.sin_family = AF_INET;
addr.sin_port = data->port;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_addr.s_addr = data->addr;
ret = connect(s0, (struct sockaddr*)&addr, sizeof(addr));
assert(ret != -1);
@@ -95,11 +97,12 @@ void *recv_thread(void *arg)
struct sockaddr_in addr;
addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = 0x0100007fU;
+ data->addr = inet_addr("127.0.0.1");
+ addr.sin_addr.s_addr = data->addr;
i = 0;
do {
- data->port = 1025 + (rand() % 64510);
+ data->port = htons(1025 + (rand() % 64510));
addr.sin_port = data->port;
if (bind(s0, (struct sockaddr*)&addr, sizeof(addr)) != -1)
diff --git a/test/accept-test.c b/test/accept-test.c
index 71d9d80..4a904e4 100644
--- a/test/accept-test.c
+++ b/test/accept-test.c
@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
- memcpy(addr.sun_path, "\0sock", 6);
+ memcpy(addr.sun_path, "\0sock2", 7);
ret = bind(fd, (struct sockaddr *)&addr, addrlen);
assert(ret != -1);
diff --git a/test/accept.c b/test/accept.c
index f096f8a..8078ccb 100644
--- a/test/accept.c
+++ b/test/accept.c
@@ -7,6 +7,7 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
+#include <limits.h>
#include <errno.h>
#include <fcntl.h>
@@ -17,17 +18,44 @@
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include "helpers.h"
#include "liburing.h"
+#define MAX_FDS 32
static int no_accept;
+static int no_accept_multi;
struct data {
char buf[128];
struct iovec iov;
};
+struct accept_test_args {
+ int accept_should_error;
+ bool fixed;
+ bool nonblock;
+ bool queue_accept_before_connect;
+ bool multishot;
+ int extra_loops;
+};
+
+static void close_fds(int fds[], int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ close(fds[i]);
+}
+
+static void close_sock_fds(int s_fd[], int c_fd[], int nr, bool fixed)
+{
+ if (!fixed)
+ close_fds(s_fd, nr);
+ close_fds(c_fd, nr);
+}
+
static void queue_send(struct io_uring *ring, int fd)
{
struct io_uring_sqe *sqe;
@@ -39,9 +67,10 @@ static void queue_send(struct io_uring *ring, int fd)
sqe = io_uring_get_sqe(ring);
io_uring_prep_writev(sqe, fd, &d->iov, 1, 0);
+ sqe->user_data = 1;
}
-static void queue_recv(struct io_uring *ring, int fd)
+static void queue_recv(struct io_uring *ring, int fd, bool fixed)
{
struct io_uring_sqe *sqe;
struct data *d;
@@ -52,32 +81,73 @@ static void queue_recv(struct io_uring *ring, int fd)
sqe = io_uring_get_sqe(ring);
io_uring_prep_readv(sqe, fd, &d->iov, 1, 0);
+ sqe->user_data = 2;
+ if (fixed)
+ sqe->flags |= IOSQE_FIXED_FILE;
}
-static int accept_conn(struct io_uring *ring, int fd)
+static void queue_accept_conn(struct io_uring *ring, int fd,
+ struct accept_test_args args)
{
struct io_uring_sqe *sqe;
- struct io_uring_cqe *cqe;
int ret;
+ int fixed_idx = args.fixed ? 0 : -1;
+ int count = 1 + args.extra_loops;
+ bool multishot = args.multishot;
+
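+	/* fixed_idx < 0 queues a normal accept; >= 0 targets the fixed file table */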
+ while (count--) {
+ sqe = io_uring_get_sqe(ring);
+ if (fixed_idx < 0) {
+ if (!multishot)
+ io_uring_prep_accept(sqe, fd, NULL, NULL, 0);
+ else
+ io_uring_prep_multishot_accept(sqe, fd, NULL,
+ NULL, 0);
+ } else {
+ if (!multishot)
+ io_uring_prep_accept_direct(sqe, fd, NULL, NULL,
+ 0, fixed_idx);
+ else
+ io_uring_prep_multishot_accept_direct(sqe, fd,
+ NULL, NULL,
+ 0);
+ }
- sqe = io_uring_get_sqe(ring);
- io_uring_prep_accept(sqe, fd, NULL, NULL, 0);
+ ret = io_uring_submit(ring);
+ assert(ret != -1);
+ }
+}
- ret = io_uring_submit(ring);
- assert(ret != -1);
+static int accept_conn(struct io_uring *ring, int fixed_idx, bool multishot)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
ret = io_uring_wait_cqe(ring, &cqe);
assert(!ret);
ret = cqe->res;
io_uring_cqe_seen(ring, cqe);
+
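+	/*
+	 * Single-shot direct accept reports success as res == 0 with the
+	 * connection installed in the requested fixed slot; multishot
+	 * direct returns the allocated slot index instead.
+	 */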
+ if (fixed_idx >= 0) {
+ if (ret > 0) {
+ if (!multishot) {
+ close(ret);
+ return -EINVAL;
+ }
+ } else if (!ret) {
+ ret = fixed_idx;
+ }
+ }
return ret;
}
-static int start_accept_listen(struct sockaddr_in *addr, int port_off)
+static int start_accept_listen(struct sockaddr_in *addr, int port_off,
+ int extra_flags)
{
int fd, ret;
- fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC | extra_flags,
+ IPPROTO_TCP);
int32_t val = 1;
ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val));
@@ -91,8 +161,8 @@ static int start_accept_listen(struct sockaddr_in *addr, int port_off)
addr = &laddr;
addr->sin_family = AF_INET;
- addr->sin_port = 0x1235 + port_off;
- addr->sin_addr.s_addr = 0x0100007fU;
+ addr->sin_port = htons(0x1235 + port_off);
+ addr->sin_addr.s_addr = inet_addr("127.0.0.1");
ret = bind(fd, (struct sockaddr*)addr, sizeof(*addr));
assert(ret != -1);
@@ -102,58 +172,107 @@ static int start_accept_listen(struct sockaddr_in *addr, int port_off)
return fd;
}
-static int test(struct io_uring *ring, int accept_should_error)
+static int set_client_fd(struct sockaddr_in *addr)
{
- struct io_uring_cqe *cqe;
- struct sockaddr_in addr;
- uint32_t head;
- uint32_t count = 0;
- int done = 0;
- int p_fd[2];
- int ret;
-
- int32_t val, recv_s0 = start_accept_listen(&addr, 0);
+ int32_t val;
+ int fd, ret;
- p_fd[1] = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
val = 1;
- ret = setsockopt(p_fd[1], IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
+ ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
assert(ret != -1);
- int32_t flags = fcntl(p_fd[1], F_GETFL, 0);
+ int32_t flags = fcntl(fd, F_GETFL, 0);
assert(flags != -1);
flags |= O_NONBLOCK;
- ret = fcntl(p_fd[1], F_SETFL, flags);
+ ret = fcntl(fd, F_SETFL, flags);
assert(ret != -1);
- ret = connect(p_fd[1], (struct sockaddr*)&addr, sizeof(addr));
+ ret = connect(fd, (struct sockaddr *)addr, sizeof(*addr));
assert(ret == -1);
- flags = fcntl(p_fd[1], F_GETFL, 0);
+ flags = fcntl(fd, F_GETFL, 0);
assert(flags != -1);
flags &= ~O_NONBLOCK;
- ret = fcntl(p_fd[1], F_SETFL, flags);
+ ret = fcntl(fd, F_SETFL, flags);
assert(ret != -1);
- p_fd[0] = accept_conn(ring, recv_s0);
- if (p_fd[0] == -EINVAL) {
- if (accept_should_error)
+ return fd;
+}
+
+static int test_loop(struct io_uring *ring,
+ struct accept_test_args args,
+ int recv_s0,
+ struct sockaddr_in *addr)
+{
+ struct io_uring_cqe *cqe;
+ uint32_t head, count = 0;
+ int i, ret, s_fd[MAX_FDS], c_fd[MAX_FDS], done = 0;
+ bool fixed = args.fixed;
+ bool multishot = args.multishot;
+ uint32_t multishot_mask = 0;
+ int nr_fds = multishot ? MAX_FDS : 1;
+
+ for (i = 0; i < nr_fds; i++)
+ c_fd[i] = set_client_fd(addr);
+
+ if (!args.queue_accept_before_connect)
+ queue_accept_conn(ring, recv_s0, args);
+
+ for (i = 0; i < nr_fds; i++) {
+ s_fd[i] = accept_conn(ring, fixed ? 0 : -1, multishot);
+ if (s_fd[i] == -EINVAL) {
+ if (args.accept_should_error)
+ goto out;
+ fprintf(stdout,
+ "%s %s Accept not supported, skipping\n",
+ fixed ? "Fixed" : "",
+ multishot ? "Multishot" : "");
+ if (multishot)
+ no_accept_multi = 1;
+ else
+ no_accept = 1;
goto out;
- fprintf(stdout, "Accept not supported, skipping\n");
- no_accept = 1;
+ } else if (s_fd[i] < 0) {
+ if (args.accept_should_error &&
+ (s_fd[i] == -EBADF || s_fd[i] == -EINVAL))
+ goto out;
+ fprintf(stderr, "%s %s Accept[%d] got %d\n",
+ fixed ? "Fixed" : "",
+ multishot ? "Multishot" : "",
+ i, s_fd[i]);
+ goto err;
+ }
+
+ if (multishot && fixed) {
+ if (s_fd[i] >= MAX_FDS) {
+ fprintf(stderr,
+ "Fixed Multishot Accept[%d] got outbound index: %d\n",
+ i, s_fd[i]);
+ goto err;
+ }
+ /*
+			 * For the fixed multishot accept test, the file slots
+			 * allocated are in [0, 32), so every bit of the u32
+			 * mask should end up set.
+ */
+ multishot_mask |= (1U << s_fd[i]);
+ }
+ }
+
+ if (multishot) {
+ if (fixed && (~multishot_mask != 0U)) {
+ fprintf(stderr, "Fixed Multishot Accept misses events\n");
+ goto err;
+ }
goto out;
- } else if (p_fd[0] < 0) {
- if (accept_should_error &&
- (p_fd[0] == -EBADF || p_fd[0] == -EINVAL))
- goto out;
- fprintf(stderr, "Accept got %d\n", p_fd[0]);
- goto err;
}
- queue_send(ring, p_fd[1]);
- queue_recv(ring, p_fd[0]);
+ queue_send(ring, c_fd[0]);
+ queue_recv(ring, s_fd[0], fixed);
ret = io_uring_submit_and_wait(ring, 2);
assert(ret != -1);
@@ -161,7 +280,8 @@ static int test(struct io_uring *ring, int accept_should_error)
while (count < 2) {
io_uring_for_each_cqe(ring, head, cqe) {
if (cqe->res < 0) {
- fprintf(stderr, "Got cqe res %d\n", cqe->res);
+ fprintf(stderr, "Got cqe res %d, user_data %i\n",
+ cqe->res, (int)cqe->user_data);
done = 1;
break;
}
@@ -176,17 +296,32 @@ static int test(struct io_uring *ring, int accept_should_error)
}
out:
- close(p_fd[0]);
- close(p_fd[1]);
- close(recv_s0);
+ close_sock_fds(s_fd, c_fd, nr_fds, fixed);
return 0;
err:
- close(p_fd[0]);
- close(p_fd[1]);
- close(recv_s0);
+ close_sock_fds(s_fd, c_fd, nr_fds, fixed);
return 1;
}
+static int test(struct io_uring *ring, struct accept_test_args args)
+{
+ struct sockaddr_in addr;
+ int ret = 0;
+ int loop;
+ int32_t recv_s0 = start_accept_listen(&addr, 0,
+ args.nonblock ? O_NONBLOCK : 0);
+ if (args.queue_accept_before_connect)
+ queue_accept_conn(ring, recv_s0, args);
+ for (loop = 0; loop < 1 + args.extra_loops; loop++) {
+ ret = test_loop(ring, args, recv_s0, &addr);
+ if (ret)
+ break;
+ }
+
+ close(recv_s0);
+ return ret;
+}
+
static void sig_alrm(int sig)
{
exit(0);
@@ -202,7 +337,7 @@ static int test_accept_pending_on_exit(void)
ret = io_uring_queue_init(32, &m_io_uring, 0);
assert(ret >= 0);
- fd = start_accept_listen(NULL, 0);
+ fd = start_accept_listen(NULL, 0, 0);
sqe = io_uring_get_sqe(&m_io_uring);
io_uring_prep_accept(sqe, fd, NULL, NULL, 0);
@@ -219,10 +354,17 @@ static int test_accept_pending_on_exit(void)
return 0;
}
+struct test_accept_many_args {
+ unsigned int usecs;
+ bool nonblock;
+ bool single_sock;
+ bool close_fds;
+};
+
/*
 * Test issuing many accepts and see if we handle cancellation on exit
*/
-static int test_accept_many(unsigned nr, unsigned usecs)
+static int test_accept_many(struct test_accept_many_args args)
{
struct io_uring m_io_uring;
struct io_uring_cqe *cqe;
@@ -230,6 +372,8 @@ static int test_accept_many(unsigned nr, unsigned usecs)
unsigned long cur_lim;
struct rlimit rlim;
int *fds, i, ret;
+ unsigned int nr = 128;
+ int nr_socks = args.single_sock ? 1 : nr;
if (getrlimit(RLIMIT_NPROC, &rlim) < 0) {
perror("getrlimit");
@@ -247,31 +391,39 @@ static int test_accept_many(unsigned nr, unsigned usecs)
ret = io_uring_queue_init(2 * nr, &m_io_uring, 0);
assert(ret >= 0);
- fds = t_calloc(nr, sizeof(int));
+ fds = t_calloc(nr_socks, sizeof(int));
- for (i = 0; i < nr; i++)
- fds[i] = start_accept_listen(NULL, i);
+ for (i = 0; i < nr_socks; i++)
+ fds[i] = start_accept_listen(NULL, i,
+ args.nonblock ? O_NONBLOCK : 0);
for (i = 0; i < nr; i++) {
+ int sock_idx = args.single_sock ? 0 : i;
sqe = io_uring_get_sqe(&m_io_uring);
- io_uring_prep_accept(sqe, fds[i], NULL, NULL, 0);
+ io_uring_prep_accept(sqe, fds[sock_idx], NULL, NULL, 0);
sqe->user_data = 1 + i;
ret = io_uring_submit(&m_io_uring);
assert(ret == 1);
}
- if (usecs)
- usleep(usecs);
+ if (args.usecs)
+ usleep(args.usecs);
+
+ if (args.close_fds)
+ for (i = 0; i < nr_socks; i++)
+ close(fds[i]);
for (i = 0; i < nr; i++) {
if (io_uring_peek_cqe(&m_io_uring, &cqe))
break;
if (cqe->res != -ECANCELED) {
- fprintf(stderr, "Expected cqe to be cancelled\n");
- goto err;
+ fprintf(stderr, "Expected cqe to be cancelled %d\n", cqe->res);
+ ret = 1;
+ goto out;
}
io_uring_cqe_seen(&m_io_uring, cqe);
}
+ ret = 0;
out:
rlim.rlim_cur = cur_lim;
if (setrlimit(RLIMIT_NPROC, &rlim) < 0) {
@@ -281,40 +433,46 @@ out:
free(fds);
io_uring_queue_exit(&m_io_uring);
- return 0;
-err:
- ret = 1;
- goto out;
+ return ret;
}
-static int test_accept_cancel(unsigned usecs)
+static int test_accept_cancel(unsigned usecs, unsigned int nr, bool multishot)
{
struct io_uring m_io_uring;
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int fd, i, ret;
+ if (multishot && no_accept_multi)
+ return 0;
+
ret = io_uring_queue_init(32, &m_io_uring, 0);
assert(ret >= 0);
- fd = start_accept_listen(NULL, 0);
+ fd = start_accept_listen(NULL, 0, 0);
- sqe = io_uring_get_sqe(&m_io_uring);
- io_uring_prep_accept(sqe, fd, NULL, NULL, 0);
- sqe->user_data = 1;
- ret = io_uring_submit(&m_io_uring);
- assert(ret == 1);
+ for (i = 1; i <= nr; i++) {
+ sqe = io_uring_get_sqe(&m_io_uring);
+ if (!multishot)
+ io_uring_prep_accept(sqe, fd, NULL, NULL, 0);
+ else
+ io_uring_prep_multishot_accept(sqe, fd, NULL, NULL, 0);
+ sqe->user_data = i;
+ ret = io_uring_submit(&m_io_uring);
+ assert(ret == 1);
+ }
if (usecs)
usleep(usecs);
- sqe = io_uring_get_sqe(&m_io_uring);
- io_uring_prep_cancel(sqe, (void *) 1, 0);
- sqe->user_data = 2;
- ret = io_uring_submit(&m_io_uring);
- assert(ret == 1);
-
- for (i = 0; i < 2; i++) {
+ for (i = 1; i <= nr; i++) {
+ sqe = io_uring_get_sqe(&m_io_uring);
+ io_uring_prep_cancel64(sqe, i, 0);
+ sqe->user_data = nr + i;
+ ret = io_uring_submit(&m_io_uring);
+ assert(ret == 1);
+ }
+ for (i = 0; i < nr * 2; i++) {
ret = io_uring_wait_cqe(&m_io_uring, &cqe);
assert(!ret);
/*
@@ -327,12 +485,15 @@ static int test_accept_cancel(unsigned usecs)
* should get '-EALREADY' for the cancel request and
* '-EINTR' for the accept request.
*/
- if (cqe->user_data == 1) {
+ if (cqe->user_data == 0) {
+ fprintf(stderr, "unexpected 0 user data\n");
+ goto err;
+ } else if (cqe->user_data <= nr) {
if (cqe->res != -EINTR && cqe->res != -ECANCELED) {
fprintf(stderr, "Cancelled accept got %d\n", cqe->res);
goto err;
}
- } else if (cqe->user_data == 2) {
+ } else if (cqe->user_data <= nr * 2) {
if (cqe->res != -EALREADY && cqe->res != 0) {
fprintf(stderr, "Cancel got %d\n", cqe->res);
goto err;
@@ -342,20 +503,139 @@ static int test_accept_cancel(unsigned usecs)
}
io_uring_queue_exit(&m_io_uring);
+ close(fd);
+ return 0;
+err:
+ io_uring_queue_exit(&m_io_uring);
+ close(fd);
+ return 1;
+}
+
+static int test_accept(int count, bool before)
+{
+ struct io_uring m_io_uring;
+ int ret;
+ struct accept_test_args args = {
+ .queue_accept_before_connect = before,
+ .extra_loops = count - 1
+ };
+
+ ret = io_uring_queue_init(32, &m_io_uring, 0);
+ assert(ret >= 0);
+ ret = test(&m_io_uring, args);
+ io_uring_queue_exit(&m_io_uring);
+ return ret;
+}
+
+static int test_multishot_accept(int count, bool before)
+{
+ struct io_uring m_io_uring;
+ int ret;
+ struct accept_test_args args = {
+ .queue_accept_before_connect = before,
+ .multishot = true,
+ .extra_loops = count - 1
+ };
+
+ if (no_accept_multi)
+ return 0;
+
+ ret = io_uring_queue_init(MAX_FDS + 10, &m_io_uring, 0);
+ assert(ret >= 0);
+ ret = test(&m_io_uring, args);
+ io_uring_queue_exit(&m_io_uring);
+ return ret;
+}
+
+static int test_accept_multishot_wrong_arg(void)
+{
+ struct io_uring m_io_uring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int fd, ret;
+
+ ret = io_uring_queue_init(4, &m_io_uring, 0);
+ assert(ret >= 0);
+
+ fd = start_accept_listen(NULL, 0, 0);
+
+ sqe = io_uring_get_sqe(&m_io_uring);
+ io_uring_prep_multishot_accept_direct(sqe, fd, NULL, NULL, 0);
+ sqe->file_index = 1;
+ ret = io_uring_submit(&m_io_uring);
+ assert(ret == 1);
+
+ ret = io_uring_wait_cqe(&m_io_uring, &cqe);
+ assert(!ret);
+ if (cqe->res != -EINVAL) {
+ fprintf(stderr, "file index should be IORING_FILE_INDEX_ALLOC \
+ if its accept in multishot direct mode\n");
+ goto err;
+ }
+ io_uring_cqe_seen(&m_io_uring, cqe);
+
+ io_uring_queue_exit(&m_io_uring);
+ close(fd);
return 0;
err:
io_uring_queue_exit(&m_io_uring);
+ close(fd);
return 1;
}
-static int test_accept(void)
+
+static int test_accept_nonblock(bool queue_before_connect, int count)
{
struct io_uring m_io_uring;
int ret;
+ struct accept_test_args args = {
+ .nonblock = true,
+ .queue_accept_before_connect = queue_before_connect,
+ .extra_loops = count - 1
+ };
+
+ ret = io_uring_queue_init(32, &m_io_uring, 0);
+ assert(ret >= 0);
+ ret = test(&m_io_uring, args);
+ io_uring_queue_exit(&m_io_uring);
+ return ret;
+}
+
+static int test_accept_fixed(void)
+{
+ struct io_uring m_io_uring;
+ int ret, fd = -1;
+ struct accept_test_args args = {
+ .fixed = true
+ };
ret = io_uring_queue_init(32, &m_io_uring, 0);
assert(ret >= 0);
- ret = test(&m_io_uring, 0);
+ ret = io_uring_register_files(&m_io_uring, &fd, 1);
+ assert(ret == 0);
+ ret = test(&m_io_uring, args);
+ io_uring_queue_exit(&m_io_uring);
+ return ret;
+}
+
+static int test_multishot_fixed_accept(void)
+{
+ struct io_uring m_io_uring;
+ int ret, fd[MAX_FDS];
+ struct accept_test_args args = {
+ .fixed = true,
+ .multishot = true
+ };
+
+ if (no_accept_multi)
+ return 0;
+
+ memset(fd, -1, sizeof(fd));
+ ret = io_uring_queue_init(MAX_FDS + 10, &m_io_uring, 0);
+ assert(ret >= 0);
+ ret = io_uring_register_files(&m_io_uring, fd, MAX_FDS);
+ assert(ret == 0);
+ ret = test(&m_io_uring, args);
io_uring_queue_exit(&m_io_uring);
return ret;
}
@@ -364,7 +644,8 @@ static int test_accept_sqpoll(void)
{
struct io_uring m_io_uring;
struct io_uring_params p = { };
- int ret, should_fail;
+ int ret;
+ struct accept_test_args args = { };
p.flags = IORING_SETUP_SQPOLL;
ret = t_create_ring_params(32, &m_io_uring, &p);
@@ -373,11 +654,11 @@ static int test_accept_sqpoll(void)
else if (ret < 0)
return ret;
- should_fail = 1;
+ args.accept_should_error = 1;
if (p.features & IORING_FEAT_SQPOLL_NONFIXED)
- should_fail = 0;
+ args.accept_should_error = 0;
- ret = test(&m_io_uring, should_fail);
+ ret = test(&m_io_uring, args);
io_uring_queue_exit(&m_io_uring);
return ret;
}
@@ -388,8 +669,7 @@ int main(int argc, char *argv[])
if (argc > 1)
return 0;
-
- ret = test_accept();
+ ret = test_accept(1, false);
if (ret) {
fprintf(stderr, "test_accept failed\n");
return ret;
@@ -397,41 +677,153 @@ int main(int argc, char *argv[])
if (no_accept)
return 0;
+ ret = test_accept(2, false);
+ if (ret) {
+ fprintf(stderr, "test_accept(2) failed\n");
+ return ret;
+ }
+
+ ret = test_accept(2, true);
+ if (ret) {
+ fprintf(stderr, "test_accept(2, true) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_nonblock(false, 1);
+ if (ret) {
+ fprintf(stderr, "test_accept_nonblock failed\n");
+ return ret;
+ }
+
+ ret = test_accept_nonblock(true, 1);
+ if (ret) {
+ fprintf(stderr, "test_accept_nonblock(before, 1) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_nonblock(true, 3);
+ if (ret) {
+ fprintf(stderr, "test_accept_nonblock(before,3) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_fixed();
+ if (ret) {
+ fprintf(stderr, "test_accept_fixed failed\n");
+ return ret;
+ }
+
+ ret = test_multishot_fixed_accept();
+ if (ret) {
+ fprintf(stderr, "test_multishot_fixed_accept failed\n");
+ return ret;
+ }
+
+ ret = test_accept_multishot_wrong_arg();
+ if (ret) {
+ fprintf(stderr, "test_accept_multishot_wrong_arg failed\n");
+ return ret;
+ }
+
ret = test_accept_sqpoll();
if (ret) {
fprintf(stderr, "test_accept_sqpoll failed\n");
return ret;
}
- ret = test_accept_cancel(0);
+ ret = test_accept_cancel(0, 1, false);
if (ret) {
fprintf(stderr, "test_accept_cancel nodelay failed\n");
return ret;
}
- ret = test_accept_cancel(10000);
+ ret = test_accept_cancel(10000, 1, false);
if (ret) {
fprintf(stderr, "test_accept_cancel delay failed\n");
return ret;
}
- ret = test_accept_many(128, 0);
+ ret = test_accept_cancel(0, 4, false);
if (ret) {
- fprintf(stderr, "test_accept_many failed\n");
+ fprintf(stderr, "test_accept_cancel nodelay failed\n");
+ return ret;
+ }
+
+ ret = test_accept_cancel(10000, 4, false);
+ if (ret) {
+ fprintf(stderr, "test_accept_cancel delay failed\n");
+ return ret;
+ }
+
+ ret = test_accept_cancel(0, 1, true);
+ if (ret) {
+ fprintf(stderr, "test_accept_cancel multishot nodelay failed\n");
+ return ret;
+ }
+
+ ret = test_accept_cancel(10000, 1, true);
+ if (ret) {
+ fprintf(stderr, "test_accept_cancel multishot delay failed\n");
return ret;
}
- ret = test_accept_many(128, 100000);
+ ret = test_accept_cancel(0, 4, true);
+ if (ret) {
+ fprintf(stderr, "test_accept_cancel multishot nodelay failed\n");
+ return ret;
+ }
+
+ ret = test_accept_cancel(10000, 4, true);
+ if (ret) {
+ fprintf(stderr, "test_accept_cancel multishot delay failed\n");
+ return ret;
+ }
+
+ ret = test_multishot_accept(1, false);
+ if (ret) {
+ fprintf(stderr, "test_multishot_accept(1, false) failed\n");
+ return ret;
+ }
+
+ ret = test_multishot_accept(1, true);
+ if (ret) {
+ fprintf(stderr, "test_multishot_accept(1, true) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_many((struct test_accept_many_args) {});
if (ret) {
fprintf(stderr, "test_accept_many failed\n");
return ret;
}
+ ret = test_accept_many((struct test_accept_many_args) {
+ .usecs = 100000 });
+ if (ret) {
+ fprintf(stderr, "test_accept_many(sleep) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_many((struct test_accept_many_args) {
+ .nonblock = true });
+ if (ret) {
+ fprintf(stderr, "test_accept_many(nonblock) failed\n");
+ return ret;
+ }
+
+ ret = test_accept_many((struct test_accept_many_args) {
+ .nonblock = true,
+ .single_sock = true,
+ .close_fds = true });
+ if (ret) {
+ fprintf(stderr, "test_accept_many(nonblock,close) failed\n");
+ return ret;
+ }
+
ret = test_accept_pending_on_exit();
if (ret) {
fprintf(stderr, "test_accept_pending_on_exit failed\n");
return ret;
}
-
return 0;
}
diff --git a/test/b19062a56726-test.c b/test/b19062a56726.c
index 6a0f686..6a0f686 100644
--- a/test/b19062a56726-test.c
+++ b/test/b19062a56726.c
diff --git a/test/b5837bd5311d-test.c b/test/b5837bd5311d.c
index 57a2b58..57a2b58 100644
--- a/test/b5837bd5311d-test.c
+++ b/test/b5837bd5311d.c
diff --git a/test/buf-ring.c b/test/buf-ring.c
new file mode 100644
index 0000000..3d12ef6
--- /dev/null
+++ b/test/buf-ring.c
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various shared buffer ring sanity checks
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+static int no_buf_ring;
+
+/* test trying to register classic group when ring group exists */
+static int test_mixed_reg2(int bgid)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct io_uring ring;
+ void *ptr, *bufs;
+ int ret;
+
+ ret = t_create_ring(1, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+ if (posix_memalign(&ptr, 4096, 4096))
+ return 1;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 32;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+	/* provide classic buffers to the same group id */
+ bufs = malloc(8 * 1024);
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_provide_buffers(sqe, bufs, 1024, 8, bgid, 0);
+ io_uring_submit(&ring);
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe %d\n", ret);
+ return 1;
+ }
+ if (cqe->res != -EEXIST && cqe->res != -EINVAL) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+/* test trying to register ring group when classic group exists */
+static int test_mixed_reg(int bgid)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct io_uring ring;
+ void *ptr, *bufs;
+ int ret;
+
+ ret = t_create_ring(1, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+	/* provide classic buffers to the same group id */
+ bufs = malloc(8 * 1024);
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_provide_buffers(sqe, bufs, 1024, 8, bgid, 0);
+ io_uring_submit(&ring);
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe %d\n", ret);
+ return 1;
+ }
+ if (cqe->res) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ if (posix_memalign(&ptr, 4096, 4096))
+ return 1;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 32;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret != -EEXIST) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_double_reg_unreg(int bgid)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring ring;
+ void *ptr;
+ int ret;
+
+ ret = t_create_ring(1, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+ if (posix_memalign(&ptr, 4096, 4096))
+ return 1;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 32;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ /* check that 2nd register with same bgid fails */
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 32;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret != -EEXIST) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_unregister_buf_ring(&ring, bgid);
+ if (ret) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_unregister_buf_ring(&ring, bgid);
+ if (ret != -EINVAL && ret != -ENOENT) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_reg_unreg(int bgid)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring ring;
+ void *ptr;
+ int ret;
+
+ ret = t_create_ring(1, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+ if (posix_memalign(&ptr, 4096, 4096))
+ return 1;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 32;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret) {
+ if (ret == -EINVAL) {
+ no_buf_ring = 1;
+ return 0;
+ }
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_unregister_buf_ring(&ring, bgid);
+ if (ret) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_one_read(int fd, int bgid, struct io_uring *ring)
+{
+	int ret;
+	unsigned flags;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return -1;
+ }
+
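+	/* NULL buffer + IOSQE_BUFFER_SELECT: kernel picks one from group bgid */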
+ io_uring_prep_read(sqe, fd, NULL, 1, 0);
+ sqe->flags |= IOSQE_BUFFER_SELECT;
+ sqe->buf_group = bgid;
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return -1;
+ }
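+	/*
+	 * Grab res and flags before marking the CQE seen; the slot may be
+	 * reused by the kernel afterwards. The selected buffer id lives in
+	 * the upper 16 bits of cqe->flags (IORING_CQE_BUFFER_SHIFT).
+	 */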
+	ret = cqe->res;
+	flags = cqe->flags;
+	io_uring_cqe_seen(ring, cqe);
+
+ if (ret == -ENOBUFS)
+ return ret;
+
+ if (ret != 1) {
+ fprintf(stderr, "read result %d\n", ret);
+ return -1;
+ }
+
+	return flags >> 16;
+}
+
+static int test_running(int bgid, int entries, int loops)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring ring;
+ void *ptr;
+ char buffer[8];
+ int ret;
+ int ring_size = (entries * sizeof(struct io_uring_buf) + 4095) & (~4095);
+ int ring_mask = io_uring_buf_ring_mask(entries);
+
+ int loop, idx;
+ bool *buffers;
+ struct io_uring_buf_ring *br;
+ int read_fd;
+
+ ret = t_create_ring(1, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+ if (posix_memalign(&ptr, 4096, ring_size))
+ return 1;
+
+ br = (struct io_uring_buf_ring *)ptr;
+ io_uring_buf_ring_init(br);
+
+ buffers = malloc(sizeof(bool) * entries);
+ if (!buffers)
+ return 1;
+
+ read_fd = open("/dev/zero", O_RDONLY);
+ if (read_fd < 0)
+ return 1;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = entries;
+ reg.bgid = bgid;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret) {
+		/* support should have been detected by test_reg_unreg() by now */
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
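+	/*
+	 * Each loop provides all 'entries' buffers, expects every one to be
+	 * handed out exactly once, then expects -ENOBUFS once they run out.
+	 */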
+ for (loop = 0; loop < loops; loop++) {
+ memset(buffers, 0, sizeof(bool) * entries);
+ for (idx = 0; idx < entries; idx++)
+ io_uring_buf_ring_add(br, buffer, sizeof(buffer), idx, ring_mask, idx);
+ io_uring_buf_ring_advance(br, entries);
+
+ for (idx = 0; idx < entries; idx++) {
+ memset(buffer, 1, sizeof(buffer));
+ ret = test_one_read(read_fd, bgid, &ring);
+ if (ret < 0) {
+ fprintf(stderr, "bad run %d/%d = %d\n", loop, idx, ret);
+ return ret;
+ }
+ if (buffers[ret]) {
+ fprintf(stderr, "reused buffer %d/%d = %d!\n", loop, idx, ret);
+ return 1;
+ }
+ if (buffer[0] != 0) {
+ fprintf(stderr, "unexpected read %d %d/%d = %d!\n",
+ (int)buffer[0], loop, idx, ret);
+ return 1;
+ }
+ if (buffer[1] != 1) {
+ fprintf(stderr, "unexpected spilled read %d %d/%d = %d!\n",
+ (int)buffer[1], loop, idx, ret);
+ return 1;
+ }
+ buffers[ret] = true;
+ }
+ ret = test_one_read(read_fd, bgid, &ring);
+ if (ret != -ENOBUFS) {
+ fprintf(stderr, "expected enobufs run %d = %d\n", loop, ret);
+ return 1;
+ }
+
+ }
+
+ ret = io_uring_unregister_buf_ring(&ring, bgid);
+ if (ret) {
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ close(read_fd);
+ io_uring_queue_exit(&ring);
+ free(buffers);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int bgids[] = { 1, 127, -1 };
+	int entries[] = { 1, 32768, 4096, -1 };
+ int ret, i;
+
+ if (argc > 1)
+ return 0;
+
+ for (i = 0; bgids[i] != -1; i++) {
+ ret = test_reg_unreg(bgids[i]);
+ if (ret) {
+ fprintf(stderr, "test_reg_unreg failed\n");
+ return 1;
+ }
+ if (no_buf_ring)
+ break;
+
+ ret = test_double_reg_unreg(bgids[i]);
+ if (ret) {
+ fprintf(stderr, "test_double_reg_unreg failed\n");
+ return 1;
+ }
+
+ ret = test_mixed_reg(bgids[i]);
+ if (ret) {
+ fprintf(stderr, "test_mixed_reg failed\n");
+ return 1;
+ }
+
+ ret = test_mixed_reg2(bgids[i]);
+ if (ret) {
+ fprintf(stderr, "test_mixed_reg2 failed\n");
+ return 1;
+ }
+ }
+
+ for (i = 0; !no_buf_ring && entries[i] != -1; i++) {
+ ret = test_running(2, entries[i], 3);
+ if (ret) {
+ fprintf(stderr, "test_running(%d) failed\n", entries[i]);
+ return 1;
+ }
+ }
+
+ return 0;
+}
diff --git a/test/ce593a6c480a-test.c b/test/ce593a6c480a.c
index c6949f0..47de128 100644
--- a/test/ce593a6c480a-test.c
+++ b/test/ce593a6c480a.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
/*
* Test 5.7 regression with task_work not being run while a task is
* waiting on another event in the kernel.
diff --git a/test/connect.c b/test/connect.c
index 0ba3ee6..3ae10de 100644
--- a/test/connect.c
+++ b/test/connect.c
@@ -19,7 +19,8 @@
#include "liburing.h"
static int no_connect;
-static int use_port;
+static unsigned short use_port;
+static unsigned int use_addr;
static int create_socket(void)
{
@@ -90,7 +91,7 @@ static int listen_on_socket(int fd)
memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
addr.sin_port = use_port;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_addr.s_addr = use_addr;
ret = bind(fd, (struct sockaddr*)&addr, sizeof(addr));
if (ret == -1) {
@@ -369,6 +370,8 @@ int main(int argc, char *argv[])
srand(getpid());
use_port = (rand() % 61440) + 4096;
+ use_port = htons(use_port);
+ use_addr = inet_addr("127.0.0.1");
ret = test_connect_with_no_peer(&ring);
if (ret == -1) {
diff --git a/test/cq-overflow.c b/test/cq-overflow.c
index 945dc93..057570e 100644
--- a/test/cq-overflow.c
+++ b/test/cq-overflow.c
@@ -243,6 +243,7 @@ err:
int main(int argc, char *argv[])
{
+ const char *fname = ".cq-overflow";
unsigned iters, drops;
unsigned long usecs;
int ret;
@@ -256,7 +257,7 @@ int main(int argc, char *argv[])
return ret;
}
- t_create_file(".basic-rw", FILE_SIZE);
+ t_create_file(fname, FILE_SIZE);
vecs = t_create_buffers(BUFFERS, BS);
@@ -265,7 +266,7 @@ int main(int argc, char *argv[])
do {
drops = 0;
- if (test_io(".basic-rw", usecs, &drops, 0)) {
+ if (test_io(fname, usecs, &drops, 0)) {
fprintf(stderr, "test_io nofault failed\n");
goto err;
}
@@ -275,19 +276,19 @@ int main(int argc, char *argv[])
iters++;
} while (iters < 40);
- if (test_io(".basic-rw", usecs, &drops, 0)) {
+ if (test_io(fname, usecs, &drops, 0)) {
fprintf(stderr, "test_io nofault failed\n");
goto err;
}
- if (test_io(".basic-rw", usecs, &drops, 1)) {
+ if (test_io(fname, usecs, &drops, 1)) {
fprintf(stderr, "test_io fault failed\n");
goto err;
}
- unlink(".basic-rw");
+ unlink(fname);
return 0;
err:
- unlink(".basic-rw");
+ unlink(fname);
return 1;
}
diff --git a/test/cq-size.c b/test/cq-size.c
index b7dd5b4..4e6e3d1 100644
--- a/test/cq-size.c
+++ b/test/cq-size.c
@@ -45,14 +45,20 @@ int main(int argc, char *argv[])
p.cq_entries = 0;
ret = io_uring_queue_init_params(4, &ring, &p);
- if (ret >= 0 || errno != EINVAL) {
+ if (ret >= 0) {
printf("zero sized cq ring succeeded\n");
+ io_uring_queue_exit(&ring);
+ goto err;
+ }
+
+ if (ret != -EINVAL) {
+ printf("io_uring_queue_init_params failed, but not with -EINVAL"
+ ", returned error %d (%s)\n", ret, strerror(-ret));
goto err;
}
done:
return 0;
err:
- io_uring_queue_exit(&ring);
return 1;
}
diff --git a/test/d4ae271dfaae-test.c b/test/d4ae271dfaae.c
index 80d3f71..397b94b 100644
--- a/test/d4ae271dfaae-test.c
+++ b/test/d4ae271dfaae.c
@@ -27,11 +27,6 @@ int main(int argc, char *argv[])
char *fname;
void *buf;
- if (geteuid()) {
- fprintf(stdout, "Test requires root, skipping\n");
- return 0;
- }
-
memset(&p, 0, sizeof(p));
p.flags = IORING_SETUP_SQPOLL;
ret = t_create_ring_params(4, &ring, &p);
@@ -48,6 +43,8 @@ int main(int argc, char *argv[])
}
fd = open(fname, O_RDONLY | O_DIRECT);
+ if (fname != argv[1])
+ unlink(fname);
if (fd < 0) {
perror("open");
goto out;
@@ -94,8 +91,6 @@ int main(int argc, char *argv[])
close(fd);
out:
- if (fname != argv[1])
- unlink(fname);
io_uring_queue_exit(&ring);
return ret;
}
diff --git a/test/d77a67ed5f27-test.c b/test/d77a67ed5f27.c
index e56fdcd..e56fdcd 100644
--- a/test/d77a67ed5f27-test.c
+++ b/test/d77a67ed5f27.c
diff --git a/test/defer.c b/test/defer.c
index 885cf5c..68ee4b4 100644
--- a/test/defer.c
+++ b/test/defer.c
@@ -11,6 +11,12 @@
#include "helpers.h"
#include "liburing.h"
+#define RING_SIZE 128
+enum {
+ OP_NOP,
+ OP_REMOVE_BUFFERS
+};
+
struct test_context {
struct io_uring *ring;
struct io_uring_sqe **sqes;
@@ -25,7 +31,8 @@ static void free_context(struct test_context *ctx)
memset(ctx, 0, sizeof(*ctx));
}
-static int init_context(struct test_context *ctx, struct io_uring *ring, int nr)
+static int init_context(struct test_context *ctx, struct io_uring *ring, int nr,
+ int op)
{
struct io_uring_sqe *sqe;
int i;
@@ -43,7 +50,14 @@ static int init_context(struct test_context *ctx, struct io_uring *ring, int nr)
sqe = io_uring_get_sqe(ring);
if (!sqe)
goto err;
- io_uring_prep_nop(sqe);
+ switch (op) {
+ case OP_NOP:
+ io_uring_prep_nop(sqe);
+ break;
+ case OP_REMOVE_BUFFERS:
+ io_uring_prep_remove_buffers(sqe, 10, 1);
+ break;
+	}
sqe->user_data = i;
ctx->sqes[i] = sqe;
}
@@ -79,7 +93,7 @@ static int test_cancelled_userdata(struct io_uring *ring)
struct test_context ctx;
int ret, i, nr = 100;
- if (init_context(&ctx, ring, nr))
+ if (init_context(&ctx, ring, nr, OP_NOP))
return 1;
for (i = 0; i < nr; i++)
@@ -113,7 +127,7 @@ static int test_thread_link_cancel(struct io_uring *ring)
struct test_context ctx;
int ret, i, nr = 100;
- if (init_context(&ctx, ring, nr))
+ if (init_context(&ctx, ring, nr, OP_REMOVE_BUFFERS))
return 1;
for (i = 0; i < nr; i++)
@@ -132,12 +146,12 @@ static int test_thread_link_cancel(struct io_uring *ring)
bool fail = false;
if (i == 0)
- fail = (ctx.cqes[i].res != -EINVAL);
+ fail = (ctx.cqes[i].res != -ENOENT);
else
fail = (ctx.cqes[i].res != -ECANCELED);
if (fail) {
- printf("invalid status\n");
+ printf("invalid status %d\n", ctx.cqes[i].res);
goto err;
}
}
@@ -156,7 +170,7 @@ static int test_drain_with_linked_timeout(struct io_uring *ring)
struct test_context ctx;
int ret, i;
- if (init_context(&ctx, ring, nr * 2))
+ if (init_context(&ctx, ring, nr * 2, OP_NOP))
return 1;
for (i = 0; i < nr; i++) {
@@ -186,7 +200,7 @@ static int run_drained(struct io_uring *ring, int nr)
struct test_context ctx;
int ret, i;
- if (init_context(&ctx, ring, nr))
+ if (init_context(&ctx, ring, nr, OP_NOP))
return 1;
for (i = 0; i < nr; i++)
@@ -243,30 +257,24 @@ int main(int argc, char *argv[])
{
struct io_uring ring, poll_ring, sqthread_ring;
struct io_uring_params p;
- int ret, no_sqthread = 0;
+ int ret;
if (argc > 1)
return 0;
memset(&p, 0, sizeof(p));
- ret = io_uring_queue_init_params(1000, &ring, &p);
+ ret = io_uring_queue_init_params(RING_SIZE, &ring, &p);
if (ret) {
- printf("ring setup failed\n");
+ printf("ring setup failed %i\n", ret);
return 1;
}
- ret = io_uring_queue_init(1000, &poll_ring, IORING_SETUP_IOPOLL);
+ ret = io_uring_queue_init(RING_SIZE, &poll_ring, IORING_SETUP_IOPOLL);
if (ret) {
printf("poll_ring setup failed\n");
return 1;
}
- ret = t_create_ring(1000, &sqthread_ring,
- IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL);
- if (ret == T_SETUP_SKIP)
- return 0;
- else if (ret < 0)
- return 1;
ret = test_cancelled_userdata(&poll_ring);
if (ret) {
@@ -274,16 +282,6 @@ int main(int argc, char *argv[])
return ret;
}
- if (no_sqthread) {
- printf("test_thread_link_cancel: skipped, not root\n");
- } else {
- ret = test_thread_link_cancel(&sqthread_ring);
- if (ret) {
- printf("test_thread_link_cancel failed\n");
- return ret;
- }
- }
-
if (!(p.features & IORING_FEAT_NODROP)) {
ret = test_overflow_hung(&ring);
if (ret) {
@@ -304,5 +302,18 @@ int main(int argc, char *argv[])
return ret;
}
+ ret = t_create_ring(RING_SIZE, &sqthread_ring,
+ IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret < 0)
+ return 1;
+
+ ret = test_thread_link_cancel(&sqthread_ring);
+ if (ret) {
+ printf("test_thread_link_cancel failed\n");
+ return ret;
+ }
+
return 0;
}
diff --git a/test/double-poll-crash.c b/test/double-poll-crash.c
index 2a012e5..231c7da 100644
--- a/test/double-poll-crash.c
+++ b/test/double-poll-crash.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
// https://syzkaller.appspot.com/bug?id=5c9918d20f771265ad0ffae3c8f3859d24850692
// autogenerated by syzkaller (https://github.com/google/syzkaller)
@@ -51,10 +52,14 @@ static long syz_io_uring_setup(volatile long a0, volatile long a1,
*ring_ptr_out = mmap(vma1, ring_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring,
IORING_OFF_SQ_RING);
+ if (*ring_ptr_out == MAP_FAILED)
+ exit(0);
uint32_t sqes_sz = setup_params->sq_entries * SIZEOF_IO_URING_SQE;
*sqes_ptr_out =
mmap(vma2, sqes_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring, IORING_OFF_SQES);
+ if (*sqes_ptr_out == MAP_FAILED)
+ exit(0);
return fd_io_uring;
}
@@ -103,21 +108,24 @@ static long syz_open_dev(volatile long a0, volatile long a1, volatile long a2)
}
}
-#ifndef __NR_io_uring_enter
-#define __NR_io_uring_enter 426
-#endif
-
uint64_t r[4] = {0xffffffffffffffff, 0x0, 0x0, 0xffffffffffffffff};
int main(int argc, char *argv[])
{
+ void *mmap_ret;
+#if !defined(__i386) && !defined(__x86_64__)
+ return 0;
+#endif
if (argc > 1)
return 0;
- mmap((void *)0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
- mmap((void *)0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
- mmap((void *)0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
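+	/* the reproducer depends on these fixed mappings; treat MAP_FAILED as a skip */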
+ mmap_ret = mmap((void *)0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
+ if (mmap_ret == MAP_FAILED)
+ return 0;
+ mmap_ret = mmap((void *)0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
+ if (mmap_ret == MAP_FAILED)
+ return 0;
intptr_t res = 0;
*(uint32_t*)0x20000484 = 0;
*(uint32_t*)0x20000488 = 0;
diff --git a/test/drop-submit.c b/test/drop-submit.c
new file mode 100644
index 0000000..7b15f26
--- /dev/null
+++ b/test/drop-submit.c
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test IORING_SETUP_SUBMIT_ALL
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "liburing.h"
+
+static int test(struct io_uring *ring, int expect_drops)
+{
+ struct io_uring_sqe *sqe;
+ char buf[32];
+ int ret, i;
+
+ for (i = 0; i < 4; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_nop(sqe);
+ }
+
+ /* prep two invalid reads, these will fail */
+ for (i = 0; i < 2; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_read(sqe, 128, buf, sizeof(buf), 0);
+ sqe->ioprio = (short) -1;
+ }
+
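+	/*
+	 * 6 sqes are queued: 4 nops plus 2 reads with an invalid ioprio. With
+	 * IORING_SETUP_SUBMIT_ALL all 6 are expected to be submitted; without
+	 * it, submission stops at the first failing sqe, leaving one dropped.
+	 */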
+ ret = io_uring_submit(ring);
+ if (expect_drops) {
+ if (ret != 5) {
+ fprintf(stderr, "drops submit failed: %d\n", ret);
+ goto err;
+ }
+ } else {
+ if (ret != 6) {
+ fprintf(stderr, "no drops submit failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init(8, &ring, IORING_SETUP_SUBMIT_ALL);
+ if (ret)
+ return 0;
+
+ ret = test(&ring, 0);
+ if (ret) {
+ fprintf(stderr, "test no drops failed\n");
+ return ret;
+ }
+
+ io_uring_queue_exit(&ring);
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return 0;
+ }
+
+ ret = test(&ring, 1);
+ if (ret) {
+ fprintf(stderr, "test drops failed\n");
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/test/eeed8b54e0df-test.c b/test/eeed8b54e0df.c
index b6e27cc..62f6f45 100644
--- a/test/eeed8b54e0df-test.c
+++ b/test/eeed8b54e0df.c
@@ -26,6 +26,7 @@ static int get_file_fd(void)
int fd;
fd = open("testfile", O_RDWR | O_CREAT, 0644);
+ unlink("testfile");
if (fd < 0) {
perror("open file");
return -1;
@@ -54,12 +55,6 @@ err:
return fd;
}
-static void put_file_fd(int fd)
-{
- close(fd);
- unlink("testfile");
-}
-
int main(int argc, char *argv[])
{
struct io_uring ring;
@@ -111,9 +106,9 @@ int main(int argc, char *argv[])
goto err;
}
- put_file_fd(fd);
+ close(fd);
return 0;
err:
- put_file_fd(fd);
+ close(fd);
return 1;
}
diff --git a/test/empty-eownerdead.c b/test/empty-eownerdead.c
new file mode 100644
index 0000000..40f854f
--- /dev/null
+++ b/test/empty-eownerdead.c
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test that entering with nothing to submit/wait on succeeds for an
+ * SQPOLL ring. Old kernels had a bug where this failed with EOWNERDEAD.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include "liburing.h"
+#include "helpers.h"
+#include "../src/syscall.h"
+
+int main(int argc, char *argv[])
+{
+ struct io_uring_params p = {};
+ struct io_uring ring;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ p.flags = IORING_SETUP_SQPOLL;
+ p.sq_thread_idle = 100;
+
+ ret = t_create_ring_params(1, &ring, &p);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret < 0)
+ goto err;
+
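+	/*
+	 * With nothing to submit and nothing to wait for, enter is expected
+	 * to return 0 rather than fail; buggy kernels returned EOWNERDEAD.
+	 */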
+ ret = __sys_io_uring_enter(ring.ring_fd, 0, 0, 0, NULL);
+ if (ret < 0) {
+ int __e = errno;
+
+ if (__e == EOWNERDEAD)
+			fprintf(stderr, "sqe submit unexpectedly failed due to an old kernel bug: %s\n", strerror(__e));
+ else
+ fprintf(stderr, "sqe submit unexpected failure: %s\n", strerror(__e));
+ goto err;
+ }
+
+ return 0;
+err:
+ return 1;
+}
diff --git a/test/eventfd-disable.c b/test/eventfd-disable.c
index f172fd7..6567be0 100644
--- a/test/eventfd-disable.c
+++ b/test/eventfd-disable.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include "liburing.h"
diff --git a/test/eventfd-reg.c b/test/eventfd-reg.c
new file mode 100644
index 0000000..b447455
--- /dev/null
+++ b/test/eventfd-reg.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test eventfd registration+unregistration
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+
+#include "liburing.h"
+
+int main(int argc, char *argv[])
+{
+ struct io_uring_params p = {};
+ struct io_uring ring;
+ int ret, evfd[2], i;
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init_params(8, &ring, &p);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ evfd[0] = eventfd(0, EFD_CLOEXEC);
+ evfd[1] = eventfd(0, EFD_CLOEXEC);
+ if (evfd[0] < 0 || evfd[1] < 0) {
+ perror("eventfd");
+ return 1;
+ }
+
+ ret = io_uring_register_eventfd(&ring, evfd[0]);
+ if (ret) {
+ fprintf(stderr, "failed to register evfd: %d\n", ret);
+ return 1;
+ }
+
+	/* Check that registering again will get -EBUSY */
+ ret = io_uring_register_eventfd(&ring, evfd[1]);
+ if (ret != -EBUSY) {
+ fprintf(stderr, "unexpected 2nd register: %d\n", ret);
+ return 1;
+ }
+ close(evfd[1]);
+
+ ret = io_uring_unregister_eventfd(&ring);
+ if (ret) {
+ fprintf(stderr, "unexpected unregister: %d\n", ret);
+ return 1;
+ }
+
+ /* loop 100 registers/unregister */
+ for (i = 0; i < 100; i++) {
+ ret = io_uring_register_eventfd(&ring, evfd[0]);
+ if (ret) {
+ fprintf(stderr, "failed to register evfd: %d\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_unregister_eventfd(&ring);
+ if (ret) {
+ fprintf(stderr, "unexpected unregister: %d\n", ret);
+ return 1;
+ }
+ }
+
+ close(evfd[0]);
+ return 0;
+}
diff --git a/test/eventfd-ring.c b/test/eventfd-ring.c
index 67e102c..d217312 100644
--- a/test/eventfd-ring.c
+++ b/test/eventfd-ring.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include "liburing.h"
diff --git a/test/eventfd.c b/test/eventfd.c
index 1a7e3f3..5c49ece 100644
--- a/test/eventfd.c
+++ b/test/eventfd.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include "liburing.h"
diff --git a/test/exec-target.c b/test/exec-target.c
new file mode 100644
index 0000000..2399c2d
--- /dev/null
+++ b/test/exec-target.c
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: MIT */
+
+int main(int argc, char *argv[])
+{
+ return 0;
+}
diff --git a/test/exit-no-cleanup.c b/test/exit-no-cleanup.c
new file mode 100644
index 0000000..b3fd5a4
--- /dev/null
+++ b/test/exit-no-cleanup.c
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test case: exit without cleanup while io-wq work is pending or queued.
+ *
+ * From Florian Fischer <florian.fl.fischer@fau.de>
+ * Link: https://lore.kernel.org/io-uring/20211202165606.mqryio4yzubl7ms5@pasture/
+ *
+ */
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/sysinfo.h>
+#include <unistd.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define IORING_ENTRIES 8
+
+static pthread_t *threads;
+static pthread_barrier_t init_barrier;
+static int sleep_fd, notify_fd;
+static sem_t sem;
+
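+/*
+ * Each worker thread owns a ring and blocks in a pipe read submitted to
+ * that ring, so uncompleted io_uring work still exists when main() calls
+ * exit() with no teardown at all.
+ */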
+void *thread_func(void *arg)
+{
+ struct io_uring ring;
+ int res;
+
+ res = io_uring_queue_init(IORING_ENTRIES, &ring, 0);
+ if (res)
+ err(EXIT_FAILURE, "io_uring_queue_init failed");
+
+ pthread_barrier_wait(&init_barrier);
+
+ for(;;) {
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ uint64_t buf;
+ int res;
+
+ sqe = io_uring_get_sqe(&ring);
+ assert(sqe);
+
+ io_uring_prep_read(sqe, sleep_fd, &buf, sizeof(buf), 0);
+
+ res = io_uring_submit_and_wait(&ring, 1);
+ if (res < 0)
+ err(EXIT_FAILURE, "io_uring_submit_and_wait failed");
+
+ res = io_uring_peek_cqe(&ring, &cqe);
+ assert(!res);
+ if (cqe->res < 0) {
+ errno = -cqe->res;
+ err(EXIT_FAILURE, "read failed");
+ }
+ assert(cqe->res == sizeof(buf));
+
+ sem_post(&sem);
+
+ io_uring_cqe_seen(&ring, cqe);
+ }
+
+ return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+ int res, fds[2], i, cpus;
+ const uint64_t n = 0x42;
+
+ if (argc > 1)
+ return 0;
+
+ cpus = get_nprocs();
+ res = pthread_barrier_init(&init_barrier, NULL, cpus);
+ if (res)
+ err(EXIT_FAILURE, "pthread_barrier_init failed");
+
+ res = sem_init(&sem, 0, 0);
+ if (res)
+ err(EXIT_FAILURE, "sem_init failed");
+
+ threads = t_malloc(sizeof(pthread_t) * cpus);
+
+ res = pipe(fds);
+ if (res)
+ err(EXIT_FAILURE, "pipe failed");
+
+ sleep_fd = fds[0];
+ notify_fd = fds[1];
+
+ for (i = 0; i < cpus; i++) {
+ errno = pthread_create(&threads[i], NULL, thread_func, NULL);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_create failed");
+ }
+
+ // Write #cpus notifications
+ for (i = 0; i < cpus; i++) {
+ res = write(notify_fd, &n, sizeof(n));
+ if (res < 0)
+ err(EXIT_FAILURE, "write failed");
+ assert(res == sizeof(n));
+ }
+
+ // Await that all notifications were received
+ for (i = 0; i < cpus; i++)
+ sem_wait(&sem);
+
+ // Exit without resource cleanup
+ exit(EXIT_SUCCESS);
+}
diff --git a/test/fadvise.c b/test/fadvise.c
index b6d4462..278a045 100644
--- a/test/fadvise.c
+++ b/test/fadvise.c
@@ -184,7 +184,9 @@ int main(int argc, char *argv[])
if (i >= MIN_LOOPS && !bad)
break;
}
- if (bad > good) {
+
+ /* too hard to reliably test, just ignore */
+ if (0 && bad > good) {
fprintf(stderr, "Suspicious timings\n");
goto err;
}
diff --git a/test/fallocate.c b/test/fallocate.c
index da90be8..ddb53a6 100644
--- a/test/fallocate.c
+++ b/test/fallocate.c
@@ -42,6 +42,7 @@ static int test_fallocate_rlimit(struct io_uring *ring)
perror("open");
return 1;
}
+ unlink(buf);
sqe = io_uring_get_sqe(ring);
if (!sqe) {
@@ -72,10 +73,8 @@ static int test_fallocate_rlimit(struct io_uring *ring)
}
io_uring_cqe_seen(ring, cqe);
out:
- unlink(buf);
return 0;
err:
- unlink(buf);
return 1;
}
@@ -93,6 +92,7 @@ static int test_fallocate(struct io_uring *ring)
perror("open");
return 1;
}
+ unlink(buf);
sqe = io_uring_get_sqe(ring);
if (!sqe) {
@@ -136,10 +136,8 @@ static int test_fallocate(struct io_uring *ring)
}
out:
- unlink(buf);
return 0;
err:
- unlink(buf);
return 1;
}
@@ -160,6 +158,7 @@ static int test_fallocate_fsync(struct io_uring *ring)
perror("open");
return 1;
}
+ unlink(buf);
sqe = io_uring_get_sqe(ring);
if (!sqe) {
@@ -209,10 +208,8 @@ static int test_fallocate_fsync(struct io_uring *ring)
goto err;
}
- unlink(buf);
return 0;
err:
- unlink(buf);
return 1;
}
diff --git a/test/fc2a85cb02ef-test.c b/test/fc2a85cb02ef.c
index 35addf5..bdc3c48 100644
--- a/test/fc2a85cb02ef-test.c
+++ b/test/fc2a85cb02ef.c
@@ -78,13 +78,6 @@ static int setup_fault()
return 0;
}
-#ifndef __NR_io_uring_register
-#define __NR_io_uring_register 427
-#endif
-#ifndef __NR_io_uring_setup
-#define __NR_io_uring_setup 425
-#endif
-
uint64_t r[2] = {0xffffffffffffffff, 0xffffffffffffffff};
int main(int argc, char *argv[])
diff --git a/test/file-register.c b/test/file-register.c
index c5c5507..6889dbf 100644
--- a/test/file-register.c
+++ b/test/file-register.c
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include <sys/resource.h>
#include "helpers.h"
#include "liburing.h"
@@ -351,8 +352,9 @@ static int test_basic(struct io_uring *ring, int fail)
{
int *files;
int ret;
+ int nr_files = fail ? 10 : 100;
- files = open_files(fail ? 10 : 100, 0, 0);
+ files = open_files(nr_files, 0, 0);
ret = io_uring_register_files(ring, files, 100);
if (ret) {
if (fail) {
@@ -371,10 +373,10 @@ static int test_basic(struct io_uring *ring, int fail)
fprintf(stderr, "%s: unregister %d\n", __FUNCTION__, ret);
goto err;
}
- close_files(files, 100, 0);
+ close_files(files, nr_files, 0);
return 0;
err:
- close_files(files, 100, 0);
+ close_files(files, nr_files, 0);
return 1;
}
@@ -493,6 +495,18 @@ static int test_fixed_read_write(struct io_uring *ring, int index)
return 0;
}
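+/* best-effort: raise the soft RLIMIT_NOFILE to want_files if it is lower */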
+static void adjust_nfiles(int want_files)
+{
+ struct rlimit rlim;
+
+ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0)
+ return;
+ if (rlim.rlim_cur >= want_files)
+ return;
+ rlim.rlim_cur = want_files;
+ setrlimit(RLIMIT_NOFILE, &rlim);
+}
+
/*
* Register 8K of sparse files, update one at a random spot, then do some
* file IO to verify it works.
@@ -502,6 +516,8 @@ static int test_huge(struct io_uring *ring)
int *files;
int ret;
+ adjust_nfiles(16384);
+
files = open_files(0, 8192, 0);
ret = io_uring_register_files(ring, files, 8192);
if (ret) {
@@ -729,7 +745,90 @@ static int test_fixed_removal_ordering(void)
return 0;
}
+/* mix files requiring SCM-accounting and not in a single register */
+static int test_mixed_af_unix(void)
+{
+ struct io_uring ring;
+ int i, ret, fds[2];
+ int reg_fds[32];
+	int sp[2];
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to init io_uring: %s\n", strerror(-ret));
+ return ret;
+ }
+ if (pipe(fds)) {
+ perror("pipe");
+ return -1;
+ }
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sp) != 0) {
+ perror("Failed to create Unix-domain socket pair\n");
+ return 1;
+ }
+
+ for (i = 0; i < 16; i++) {
+ reg_fds[i * 2] = fds[0];
+ reg_fds[i * 2 + 1] = sp[0];
+ }
+
+ ret = io_uring_register_files(&ring, reg_fds, 32);
+ if (ret) {
+ fprintf(stderr, "file_register: %d\n", ret);
+ return ret;
+ }
+
+ close(fds[0]);
+ close(fds[1]);
+ close(sp[0]);
+ close(sp[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_partial_register_fail(void)
+{
+ char buffer[128];
+ struct io_uring ring;
+ int ret, fds[2];
+ int reg_fds[5];
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to init io_uring: %s\n", strerror(-ret));
+ return ret;
+ }
+ if (pipe(fds)) {
+ perror("pipe");
+ return -1;
+ }
+
+ /*
+	 * Expect the register to fail, as registering io_uring fds isn't
+	 * supported; the failure shouldn't leave any fds referenced afterwards.
+ */
+ reg_fds[0] = fds[0];
+ reg_fds[1] = fds[1];
+ reg_fds[2] = -1;
+ reg_fds[3] = ring.ring_fd;
+ reg_fds[4] = -1;
+ ret = io_uring_register_files(&ring, reg_fds, 5);
+ if (!ret) {
+ fprintf(stderr, "file_register unexpectedly succeeded\n");
+ return 1;
+ }
+
+ /* ring should have fds referenced, can close them */
+ close(fds[1]);
+
+	/* confirm that fds[1] is actually closed and not ref'ed by io_uring */
+ ret = read(fds[0], buffer, 10);
+ if (ret < 0)
+ perror("read");
+ close(fds[0]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
int main(int argc, char *argv[])
{
@@ -838,5 +937,17 @@ int main(int argc, char *argv[])
return 1;
}
+ ret = test_mixed_af_unix();
+ if (ret) {
+ printf("test_mixed_af_unix failed\n");
+ return 1;
+ }
+
+ ret = test_partial_register_fail();
+ if (ret) {
+ printf("test_partial_register_fail failed\n");
+ return ret;
+ }
+
return 0;
}
diff --git a/test/file-update.c b/test/file-update.c
index 38059d4..578017e 100644
--- a/test/file-update.c
+++ b/test/file-update.c
@@ -128,6 +128,7 @@ static int test_sqe_update(struct io_uring *ring)
ret = cqe->res;
io_uring_cqe_seen(ring, cqe);
+ free(fds);
if (ret == -EINVAL) {
fprintf(stdout, "IORING_OP_FILES_UPDATE not supported, skipping\n");
return 0;
diff --git a/test/file-verify.c b/test/file-verify.c
new file mode 100644
index 0000000..327cb1d
--- /dev/null
+++ b/test/file-verify.c
@@ -0,0 +1,629 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various reads tests, verifying data
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <linux/fs.h>
+
+#include "helpers.h"
+#include "liburing.h"
+
+#define FSIZE 128*1024*1024
+#define CHUNK_SIZE 131072
+#define PUNCH_SIZE 32768
+
+/*
+ * 8 because it fits within the on-stack iov, 16 because it's larger than 8
+ */
+#define MIN_VECS 8
+#define MAX_VECS 16
+
+/*
+ * Can be anything, let's just do something for a bit of parallelism
+ */
+#define READ_BATCH 16
+
+/*
+ * Each offset in the file has the offset / sizeof(int) stored for every
+ * sizeof(int) address.
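+ * E.g. the unsigned int at byte offset 40 holds 40 / 4 = 10.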
+ */
+static int verify_buf(void *buf, size_t size, off_t off)
+{
+ int i, u_in_buf = size / sizeof(unsigned int);
+ unsigned int *ptr;
+
+ off /= sizeof(unsigned int);
+ ptr = buf;
+ for (i = 0; i < u_in_buf; i++) {
+ if (off != *ptr) {
+ fprintf(stderr, "Found %u, wanted %lu\n", *ptr, off);
+ return 1;
+ }
+ ptr++;
+ off++;
+ }
+
+ return 0;
+}
+
+static int test_truncate(struct io_uring *ring, const char *fname, int buffered,
+ int vectored, int provide_buf)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ struct iovec vec;
+ struct stat sb;
+ off_t punch_off, off, file_size;
+ void *buf = NULL;
+ int u_in_buf, i, ret, fd, first_pass = 1;
+ unsigned int *ptr;
+
+ if (buffered)
+ fd = open(fname, O_RDWR);
+ else
+ fd = open(fname, O_DIRECT | O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ if (fstat(fd, &sb) < 0) {
+ perror("stat");
+ close(fd);
+ return 1;
+ }
+
+ if (S_ISREG(sb.st_mode)) {
+ file_size = sb.st_size;
+ } else if (S_ISBLK(sb.st_mode)) {
+ unsigned long long bytes;
+
+ if (ioctl(fd, BLKGETSIZE64, &bytes) < 0) {
+ perror("ioctl");
+ close(fd);
+ return 1;
+ }
+ file_size = bytes;
+ } else {
+ goto out;
+ }
+
+ if (file_size < CHUNK_SIZE)
+ goto out;
+
+ t_posix_memalign(&buf, 4096, CHUNK_SIZE);
+
+ off = file_size - (CHUNK_SIZE / 2);
+ punch_off = off + CHUNK_SIZE / 4;
+
+ u_in_buf = CHUNK_SIZE / sizeof(unsigned int);
+ ptr = buf;
+ for (i = 0; i < u_in_buf; i++) {
+ *ptr = i;
+ ptr++;
+ }
+ ret = pwrite(fd, buf, CHUNK_SIZE / 2, off);
+ if (ret < 0) {
+ perror("pwrite");
+ goto err;
+ } else if (ret != CHUNK_SIZE / 2)
+ goto out;
+
+again:
+ /*
+ * Read in last bit of file so it's known cached, then remove half of that
+ * last bit so we get a short read that needs retry
+ */
+ ret = pread(fd, buf, CHUNK_SIZE / 2, off);
+ if (ret < 0) {
+ perror("pread");
+ goto err;
+ } else if (ret != CHUNK_SIZE / 2)
+ goto out;
+
+ if (posix_fadvise(fd, punch_off, CHUNK_SIZE / 4, POSIX_FADV_DONTNEED) < 0) {
+		perror("posix_fadvise");
+ goto err;
+ }
+
+ if (provide_buf) {
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_provide_buffers(sqe, buf, CHUNK_SIZE, 1, 0, 0);
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "submit failed %d\n", ret);
+ goto err;
+ }
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ if (ret) {
+ fprintf(stderr, "Provide buffer failed %d\n", ret);
+ goto err;
+ }
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ if (vectored) {
+ assert(!provide_buf);
+ vec.iov_base = buf;
+ vec.iov_len = CHUNK_SIZE;
+ io_uring_prep_readv(sqe, fd, &vec, 1, off);
+ } else {
+ if (provide_buf) {
+ io_uring_prep_read(sqe, fd, NULL, CHUNK_SIZE, off);
+ sqe->flags |= IOSQE_BUFFER_SELECT;
+ } else {
+ io_uring_prep_read(sqe, fd, buf, CHUNK_SIZE, off);
+ }
+ }
+ memset(buf, 0, CHUNK_SIZE);
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "Submit failed %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ if (ret != CHUNK_SIZE / 2) {
+ fprintf(stderr, "Unexpected truncated read %d\n", ret);
+ goto err;
+ }
+
+ if (verify_buf(buf, CHUNK_SIZE / 2, 0))
+ goto err;
+
+ /*
+ * Repeat, but punch first part instead of last
+ */
+ if (first_pass) {
+ punch_off = file_size - CHUNK_SIZE / 4;
+ first_pass = 0;
+ goto again;
+ }
+
+out:
+ free(buf);
+ close(fd);
+ return 0;
+err:
+ free(buf);
+ close(fd);
+ return 1;
+}
+
+enum {
+ PUNCH_NONE,
+ PUNCH_FRONT,
+ PUNCH_MIDDLE,
+ PUNCH_END,
+};
+
+/*
+ * For each chunk in file, DONTNEED a start, end, or middle segment of it.
+ * We enter here with the file fully cached every time, either freshly
+ * written or after other reads. This forces (at least) the buffered reads
+ * to be handled incrementally, exercising that path.
+ */
+static int do_punch(int fd)
+{
+ off_t offset = 0;
+ int punch_type;
+
+ while (offset + CHUNK_SIZE <= FSIZE) {
+ off_t punch_off;
+
+ punch_type = rand() % (PUNCH_END + 1);
+ switch (punch_type) {
+ default:
+ case PUNCH_NONE:
+ punch_off = -1; /* gcc... */
+ break;
+ case PUNCH_FRONT:
+ punch_off = offset;
+ break;
+ case PUNCH_MIDDLE:
+ punch_off = offset + PUNCH_SIZE;
+ break;
+ case PUNCH_END:
+ punch_off = offset + CHUNK_SIZE - PUNCH_SIZE;
+ break;
+ }
+
+ offset += CHUNK_SIZE;
+ if (punch_type == PUNCH_NONE)
+ continue;
+ if (posix_fadvise(fd, punch_off, PUNCH_SIZE, POSIX_FADV_DONTNEED) < 0) {
+			perror("posix_fadvise");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int provide_buffers(struct io_uring *ring, void **buf)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int i, ret;
+
+ /* real use case would have one buffer chopped up, but... */
+ for (i = 0; i < READ_BATCH; i++) {
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_provide_buffers(sqe, buf[i], CHUNK_SIZE, 1, 0, i);
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != READ_BATCH) {
+ fprintf(stderr, "Submit failed %d\n", ret);
+ return 1;
+ }
+
+ for (i = 0; i < READ_BATCH; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait cqe %d\n", ret);
+ return 1;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "cqe res provide %d\n", cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+}
+
+static int test(struct io_uring *ring, const char *fname, int buffered,
+ int vectored, int small_vecs, int registered, int provide)
+{
+ struct iovec vecs[READ_BATCH][MAX_VECS];
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ void *buf[READ_BATCH];
+ int ret, fd, flags;
+ int i, j, nr_vecs;
+ off_t off, voff;
+ size_t left;
+
+ if (registered) {
+ assert(!provide);
+ assert(!vectored && !small_vecs);
+ }
+ if (provide) {
+ assert(!registered);
+ assert(!vectored && !small_vecs);
+ }
+
+ flags = O_RDONLY;
+ if (!buffered)
+ flags |= O_DIRECT;
+ fd = open(fname, flags);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ if (do_punch(fd))
+ return 1;
+
+ if (vectored) {
+ if (small_vecs)
+ nr_vecs = MIN_VECS;
+ else
+ nr_vecs = MAX_VECS;
+
+ for (j = 0; j < READ_BATCH; j++) {
+ for (i = 0; i < nr_vecs; i++) {
+ void *ptr;
+
+ t_posix_memalign(&ptr, 4096, CHUNK_SIZE / nr_vecs);
+ vecs[j][i].iov_base = ptr;
+ vecs[j][i].iov_len = CHUNK_SIZE / nr_vecs;
+ }
+ }
+ } else {
+ for (j = 0; j < READ_BATCH; j++)
+ t_posix_memalign(&buf[j], 4096, CHUNK_SIZE);
+ nr_vecs = 0;
+ }
+
+ if (registered) {
+ struct iovec v[READ_BATCH];
+
+ for (i = 0; i < READ_BATCH; i++) {
+ v[i].iov_base = buf[i];
+ v[i].iov_len = CHUNK_SIZE;
+ }
+ ret = io_uring_register_buffers(ring, v, READ_BATCH);
+ if (ret) {
+ fprintf(stderr, "Error buffer reg %d\n", ret);
+ goto err;
+ }
+ }
+
+ i = 0;
+ left = FSIZE;
+ off = 0;
+ while (left) {
+ int pending = 0;
+
+ if (provide && provide_buffers(ring, buf))
+ goto err;
+
+ for (i = 0; i < READ_BATCH; i++) {
+ size_t this = left;
+
+ if (this > CHUNK_SIZE)
+ this = CHUNK_SIZE;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ if (vectored) {
+ io_uring_prep_readv(sqe, fd, vecs[i], nr_vecs, off);
+ } else {
+ if (registered) {
+ io_uring_prep_read_fixed(sqe, fd, buf[i], this, off, i);
+ } else if (provide) {
+ io_uring_prep_read(sqe, fd, NULL, this, off);
+ sqe->flags |= IOSQE_BUFFER_SELECT;
+ } else {
+ io_uring_prep_read(sqe, fd, buf[i], this, off);
+ }
+ }
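+			/* encode the file offset in the upper 32 bits, the batch index below */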
+ sqe->user_data = ((uint64_t)off << 32) | i;
+ off += this;
+ left -= this;
+ pending++;
+ if (!left)
+ break;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != pending) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < pending; i++) {
+ int index;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "bad read %d, read %d\n", cqe->res, i);
+ goto err;
+ }
+ if (cqe->flags & IORING_CQE_F_BUFFER)
+ index = cqe->flags >> 16;
+ else
+ index = cqe->user_data & 0xffffffff;
+ voff = cqe->user_data >> 32;
+ io_uring_cqe_seen(ring, cqe);
+ if (vectored) {
+ for (j = 0; j < nr_vecs; j++) {
+ void *buf = vecs[index][j].iov_base;
+ size_t len = vecs[index][j].iov_len;
+
+ if (verify_buf(buf, len, voff))
+ goto err;
+ voff += len;
+ }
+ } else {
+ if (verify_buf(buf[index], CHUNK_SIZE, voff))
+ goto err;
+ }
+ }
+ }
+
+ ret = 0;
+done:
+ if (registered)
+ io_uring_unregister_buffers(ring);
+ if (vectored) {
+ for (j = 0; j < READ_BATCH; j++)
+ for (i = 0; i < nr_vecs; i++)
+ free(vecs[j][i].iov_base);
+ } else {
+ for (j = 0; j < READ_BATCH; j++)
+ free(buf[j]);
+ }
+ close(fd);
+ return ret;
+err:
+ ret = 1;
+ goto done;
+}
+
+static int fill_pattern(const char *fname)
+{
+ size_t left = FSIZE;
+ unsigned int val, *ptr;
+ void *buf;
+ int fd, i;
+
+ fd = open(fname, O_WRONLY);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ val = 0;
+ buf = t_malloc(4096);
+ while (left) {
+ int u_in_buf = 4096 / sizeof(val);
+ size_t this = left;
+
+ if (this > 4096)
+ this = 4096;
+ ptr = buf;
+ for (i = 0; i < u_in_buf; i++) {
+ *ptr = val;
+ val++;
+ ptr++;
+ }
+ if (write(fd, buf, 4096) != 4096)
+ return 1;
+ left -= 4096;
+ }
+
+ fsync(fd);
+ close(fd);
+ free(buf);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ const char *fname;
+ char buf[32];
+ int ret;
+
+ srand(getpid());
+
+ if (argc > 1) {
+ fname = argv[1];
+ } else {
+ sprintf(buf, ".file-verify.%d", getpid());
+ fname = buf;
+ t_create_file(fname, FSIZE);
+ }
+
+ ret = io_uring_queue_init(READ_BATCH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ goto err;
+ }
+
+ if (fill_pattern(fname))
+ goto err;
+
+ ret = test(&ring, fname, 1, 0, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered novec test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 1, 0, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered novec reg test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 1, 0, 0, 0, 1);
+ if (ret) {
+ fprintf(stderr, "Buffered novec provide test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 1, 1, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered vec test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 1, 1, 1, 0, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered small vec test failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, fname, 0, 0, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT novec test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 0, 0, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT novec reg test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 0, 0, 0, 0, 1);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT novec provide test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 0, 1, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT vec test failed\n");
+ goto err;
+ }
+ ret = test(&ring, fname, 0, 1, 1, 0, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT small vec test failed\n");
+ goto err;
+ }
+
+ ret = test_truncate(&ring, fname, 1, 0, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered end truncate read failed\n");
+ goto err;
+ }
+ ret = test_truncate(&ring, fname, 1, 1, 0);
+ if (ret) {
+ fprintf(stderr, "Buffered end truncate vec read failed\n");
+ goto err;
+ }
+ ret = test_truncate(&ring, fname, 1, 0, 1);
+ if (ret) {
+ fprintf(stderr, "Buffered end truncate pbuf read failed\n");
+ goto err;
+ }
+
+ ret = test_truncate(&ring, fname, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT end truncate read failed\n");
+ goto err;
+ }
+ ret = test_truncate(&ring, fname, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT end truncate vec read failed\n");
+ goto err;
+ }
+ ret = test_truncate(&ring, fname, 0, 0, 1);
+ if (ret) {
+ fprintf(stderr, "O_DIRECT end truncate pbuf read failed\n");
+ goto err;
+ }
+
+ if (buf == fname)
+ unlink(fname);
+ return 0;
+err:
+ if (buf == fname)
+ unlink(fname);
+ return 1;
+}
diff --git a/test/fixed-buf-iter.c b/test/fixed-buf-iter.c
new file mode 100644
index 0000000..9576993
--- /dev/null
+++ b/test/fixed-buf-iter.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test fixed buffers with non-iterators.
+ *
+ * Taken from: https://github.com/axboe/liburing/issues/549
+ */
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define BUF_SIZE 4096
+#define BUFFERS 1
+#define IN_FD "/dev/urandom"
+#define OUT_FD "/dev/zero"
+
+static int test(struct io_uring *ring)
+{
+ struct iovec iov[BUFFERS];
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret, fd_in, fd_out, i;
+
+ fd_in = open(IN_FD, O_RDONLY, 0644);
+ if (fd_in < 0) {
+ perror("open in");
+ return 1;
+ }
+
+ fd_out = open(OUT_FD, O_RDWR, 0644);
+ if (fd_out < 0) {
+ perror("open out");
+ return 1;
+ }
+
+ for (i = 0; i < BUFFERS; i++) {
+ iov[i].iov_base = malloc(BUF_SIZE);
+ iov[i].iov_len = BUF_SIZE;
+ memset(iov[i].iov_base, 0, BUF_SIZE);
+ }
+
+ ret = io_uring_register_buffers(ring, iov, BUFFERS);
+ if (ret) {
+ fprintf(stderr, "Error registering buffers: %s", strerror(-ret));
+ return 1;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Could not get SQE.\n");
+ return 1;
+ }
+
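+	/* read into registered buffer 0; the last argument is the buffer index */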
+ io_uring_prep_read_fixed(sqe, fd_in, iov[0].iov_base, BUF_SIZE, 0, 0);
+ io_uring_submit(ring);
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "Error waiting for completion: %s\n", strerror(-ret));
+ return 1;
+ }
+
+ if (cqe->res < 0) {
+ fprintf(stderr, "Error in async operation: %s\n", strerror(-cqe->res));
+ return 1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Could not get SQE.\n");
+ return 1;
+ }
+ io_uring_prep_write_fixed(sqe, fd_out, iov[0].iov_base, BUF_SIZE, 0, 0);
+ io_uring_submit(ring);
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "Error waiting for completion: %s\n", strerror(-ret));
+ return 1;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "Error in async operation: %s\n", strerror(-cqe->res));
+ return 1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = t_create_ring(8, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret < 0)
+ return 1;
+
+ ret = test(&ring);
+ if (ret) {
+ fprintf(stderr, "Test failed\n");
+ return 1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
diff --git a/test/fixed-reuse.c b/test/fixed-reuse.c
new file mode 100644
index 0000000..4cd8e37
--- /dev/null
+++ b/test/fixed-reuse.c
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: link <open file><read from file><close file> with an existing
+ * file present in the opened slot, verifying that we get the new file
+ * rather than the old one.
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define MAX_FILES 8
+#define FNAME1 ".slot.reuse.1"
+#define FNAME2 ".slot.reuse.2"
+#define PAT1 0xaa
+#define PAT2 0x55
+#define BSIZE 4096
+
+static int test(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ char buf[BSIZE];
+ int ret, i;
+
+ /* open FNAME1 in slot 0 */
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_openat_direct(sqe, AT_FDCWD, FNAME1, O_RDONLY, 0, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->res != 0) {
+ fprintf(stderr, "open res %d\n", ret);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+
+ /*
+ * Now open FNAME2 in that same slot, verifying we get data from
+ * FNAME2 and not FNAME1.
+ */
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_openat_direct(sqe, AT_FDCWD, FNAME2, O_RDONLY, 0, 0);
+ sqe->flags |= IOSQE_IO_LINK;
+ sqe->user_data = 2;
+
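+	/*
+	 * With IOSQE_FIXED_FILE, fd 0 here means fixed-file slot 0 (the file
+	 * just opened by the linked open above), not stdin.
+	 */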
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
+ sqe->flags |= IOSQE_FIXED_FILE;
+ sqe->flags |= IOSQE_IO_LINK;
+ sqe->user_data = 3;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_close_direct(sqe, 0);
+ sqe->user_data = 4;
+
+ ret = io_uring_submit(ring);
+ if (ret != 3) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 3; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 2:
+ if (cqe->res) {
+ fprintf(stderr, "bad open %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 3:
+ if (cqe->res != sizeof(buf)) {
+ fprintf(stderr, "bad read %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 4:
+ if (cqe->res) {
+ fprintf(stderr, "bad close %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ for (i = 0; i < sizeof(buf); i++) {
+ if (buf[i] == PAT2)
+ continue;
+ fprintf(stderr, "Bad pattern %x at %d\n", buf[i], i);
+ goto err;
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ struct io_uring_params p = { };
+ int ret, files[MAX_FILES];
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init_params(8, &ring, &p);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+ if (!(p.features & IORING_FEAT_CQE_SKIP))
+ return 0;
+
+ memset(files, -1, sizeof(files));
+ ret = io_uring_register_files(&ring, files, ARRAY_SIZE(files));
+ if (ret) {
+ fprintf(stderr, "Failed registering files\n");
+ return 1;
+ }
+
+ t_create_file_pattern(FNAME1, 4096, PAT1);
+ t_create_file_pattern(FNAME2, 4096, PAT2);
+
+ ret = test(&ring);
+ if (ret) {
+ fprintf(stderr, "test failed\n");
+ goto err;
+ }
+
+ unlink(FNAME1);
+ unlink(FNAME2);
+ return 0;
+err:
+ unlink(FNAME1);
+ unlink(FNAME2);
+ return 1;
+}
diff --git a/test/fpos.c b/test/fpos.c
new file mode 100644
index 0000000..78a6152
--- /dev/null
+++ b/test/fpos.c
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test io_uring fpos handling
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <assert.h>
+
+#include "helpers.h"
+#include "liburing.h"
+
+#define FILE_SIZE 5000
+#define QUEUE_SIZE 2048
+
+static void create_file(const char *file, size_t size)
+{
+ ssize_t ret;
+ char *buf;
+ size_t idx;
+ int fd;
+
+ buf = t_malloc(size);
+ for (idx = 0; idx < size; ++idx) {
+ /* write 0 or 1 */
+ buf[idx] = (unsigned char)(idx & 0x01);
+ }
+
+ fd = open(file, O_WRONLY | O_CREAT, 0644);
+ assert(fd >= 0);
+
+ ret = write(fd, buf, size);
+ fsync(fd);
+ close(fd);
+ free(buf);
+ assert(ret == size);
+}
+
+static int test_read(struct io_uring *ring, bool async, int blocksize)
+{
+ int ret, fd, i;
+ bool done = false;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ loff_t current, expected = 0;
+ int count_ok;
+ int count_0 = 0, count_1 = 0;
+ unsigned char buff[QUEUE_SIZE * blocksize];
+ unsigned char reordered[QUEUE_SIZE * blocksize];
+
+ create_file(".test_fpos_read", FILE_SIZE);
+ fd = open(".test_fpos_read", O_RDONLY);
+ unlink(".test_fpos_read");
+ assert(fd >= 0);
+
+ while (!done) {
+ for (i = 0; i < QUEUE_SIZE; ++i) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "no sqe\n");
+ return -1;
+ }
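+			/* offset -1: use and update the file's current position (f_pos) */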
+ io_uring_prep_read(sqe, fd,
+ buff + i * blocksize,
+ blocksize, -1);
+ sqe->user_data = i;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+ if (i != QUEUE_SIZE - 1)
+ sqe->flags |= IOSQE_IO_LINK;
+ }
+ ret = io_uring_submit_and_wait(ring, QUEUE_SIZE);
+ if (ret != QUEUE_SIZE) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ return 1;
+ }
+ count_ok = 0;
+ for (i = 0; i < QUEUE_SIZE; ++i) {
+ int res;
+
+ ret = io_uring_peek_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "peek failed: %d\n", ret);
+ return ret;
+ }
+ assert(cqe->user_data < QUEUE_SIZE);
+ memcpy(reordered + count_ok,
+ buff + cqe->user_data * blocksize, blocksize);
+ res = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ if (res == 0) {
+ done = true;
+ } else if (res == -ECANCELED) {
+ /* cancelled, probably ok */
+ } else if (res < 0 || res > blocksize) {
+ fprintf(stderr, "bad read: %d\n", res);
+ return -1;
+ } else {
+ expected += res;
+ count_ok += res;
+ }
+ }
+ ret = 0;
+ for (i = 0; i < count_ok; i++) {
+ if (reordered[i] == 1) {
+ count_1++;
+ } else if (reordered[i] == 0) {
+ count_0++;
+ } else {
+ fprintf(stderr, "odd read %d\n",
+ (int)reordered[i]);
+ ret = -1;
+ break;
+ }
+ }
+ if (labs(count_1 - count_0) > 1) {
+ fprintf(stderr, "inconsistent reads, got 0s:%d 1s:%d\n",
+ count_0, count_1);
+ ret = -1;
+ }
+ current = lseek(fd, 0, SEEK_CUR);
+ if (current != expected) {
+ fprintf(stderr, "f_pos incorrect, expected %ld have %ld\n",
+ (long) expected, (long) current);
+ ret = -1;
+ }
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+
+static int test_write(struct io_uring *ring, bool async, int blocksize)
+{
+ int ret, fd, i;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ bool fail = false;
+ loff_t current;
+ char data[blocksize+1];
+ char readbuff[QUEUE_SIZE*blocksize+1];
+
+ fd = open(".test_fpos_write", O_RDWR | O_CREAT, 0644);
+ unlink(".test_fpos_write");
+ assert(fd >= 0);
+
+ for (i = 0; i < blocksize; i++)
+ data[i] = 'A' + i;
+
+ data[blocksize] = '\0';
+
+ for (i = 0; i < QUEUE_SIZE; ++i) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "no sqe\n");
+ return -1;
+ }
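+		/* each 1-byte write at offset -1 appends at the shared f_pos */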
+ io_uring_prep_write(sqe, fd, data + (i % blocksize), 1, -1);
+ sqe->user_data = 1;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+ if (i != QUEUE_SIZE - 1)
+ sqe->flags |= IOSQE_IO_LINK;
+ }
+ ret = io_uring_submit_and_wait(ring, QUEUE_SIZE);
+ if (ret != QUEUE_SIZE) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ return 1;
+ }
+ for (i = 0; i < QUEUE_SIZE; ++i) {
+ int res;
+
+ ret = io_uring_peek_cqe(ring, &cqe);
+ res = cqe->res;
+ if (ret) {
+ fprintf(stderr, "peek failed: %d\n", ret);
+ return ret;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ if (!fail && res != 1) {
+ fprintf(stderr, "bad result %d\n", res);
+ fail = true;
+ }
+ }
+ current = lseek(fd, 0, SEEK_CUR);
+ if (current != QUEUE_SIZE) {
+		fprintf(stderr, "f_pos incorrect, expected %d have %ld\n",
+				QUEUE_SIZE, (long) current);
+ fail = true;
+ }
+ current = lseek(fd, 0, SEEK_SET);
+ if (current != 0) {
+ perror("seek to start");
+ return -1;
+ }
+ ret = read(fd, readbuff, QUEUE_SIZE);
+ if (ret != QUEUE_SIZE) {
+ fprintf(stderr, "did not write enough: %d\n", ret);
+ return -1;
+ }
+ i = 0;
+ while (i < QUEUE_SIZE - blocksize) {
+ if (strncmp(readbuff + i, data, blocksize)) {
+ char bad[QUEUE_SIZE+1];
+
+ memcpy(bad, readbuff + i, blocksize);
+ bad[blocksize] = '\0';
+ fprintf(stderr, "unexpected data %s\n", bad);
+ fail = true;
+ }
+ i += blocksize;
+ }
+
+ return fail ? -1 : 0;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init(QUEUE_SIZE, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return 1;
+ }
+
+ for (int test = 0; test < 8; test++) {
+ int async = test & 0x01;
+ int write = test & 0x02;
+ int blocksize = test & 0x04 ? 1 : 7;
+
+ ret = write
+ ? test_write(&ring, !!async, blocksize)
+ : test_read(&ring, !!async, blocksize);
+ if (ret) {
+ fprintf(stderr, "failed %s async=%d blocksize=%d\n",
+ write ? "write" : "read",
+ async, blocksize);
+ return -1;
+ }
+ }
+ return 0;
+}
diff --git a/test/fsync.c b/test/fsync.c
index 7e93ecc..5ae8441 100644
--- a/test/fsync.c
+++ b/test/fsync.c
@@ -63,13 +63,14 @@ static int test_barrier_fsync(struct io_uring *ring)
int i, fd, ret;
off_t off;
- fd = open("testfile", O_WRONLY | O_CREAT, 0644);
+ fd = open("fsync-testfile", O_WRONLY | O_CREAT, 0644);
if (fd < 0) {
perror("open");
return 1;
}
+ unlink("fsync-testfile");
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < ARRAY_SIZE(iovecs); i++) {
iovecs[i].iov_base = t_malloc(4096);
iovecs[i].iov_len = 4096;
}
@@ -129,11 +130,15 @@ static int test_barrier_fsync(struct io_uring *ring)
io_uring_cqe_seen(ring, cqe);
}
- unlink("testfile");
- return 0;
+
+ ret = 0;
+ goto out;
err:
- unlink("testfile");
- return 1;
+ ret = 1;
+out:
+ for (i = 0; i < ARRAY_SIZE(iovecs); i++)
+ free(iovecs[i].iov_base);
+ return ret;
}
#define FILE_SIZE 1024
diff --git a/test/hardlink.c b/test/hardlink.c
index 1c73424..634b8ed 100644
--- a/test/hardlink.c
+++ b/test/hardlink.c
@@ -73,6 +73,9 @@ int main(int argc, char *argv[])
int ret;
struct io_uring ring;
+ if (argc > 1)
+ return 0;
+
ret = io_uring_queue_init(8, &ring, 0);
if (ret) {
fprintf(stderr, "queue init failed: %d\n", ret);
diff --git a/test/helpers.c b/test/helpers.c
index 930d82a..491822e 100644
--- a/test/helpers.c
+++ b/test/helpers.c
@@ -49,14 +49,14 @@ void *t_calloc(size_t nmemb, size_t size)
/*
* Helper for creating file and write @size byte buf with 0xaa value in the file.
*/
-void t_create_file(const char *file, size_t size)
+static void __t_create_file(const char *file, size_t size, char pattern)
{
ssize_t ret;
char *buf;
int fd;
buf = t_malloc(size);
- memset(buf, 0xaa, size);
+ memset(buf, pattern, size);
fd = open(file, O_WRONLY | O_CREAT, 0644);
assert(fd >= 0);
@@ -68,6 +68,16 @@ void t_create_file(const char *file, size_t size)
assert(ret == size);
}
+void t_create_file(const char *file, size_t size)
+{
+ __t_create_file(file, size, 0xaa);
+}
+
+void t_create_file_pattern(const char *file, size_t size, char pattern)
+{
+ __t_create_file(file, size, pattern);
+}
+
/*
* Helper for creating @buf_num number of iovec
* with @buf_size bytes buffer of each iovec.
@@ -114,3 +124,22 @@ enum t_setup_ret t_create_ring(int depth, struct io_uring *ring,
p.flags = flags;
return t_create_ring_params(depth, ring, &p);
}
+
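+/*
+ * Register buffers with the ring, returning T_SETUP_SKIP instead of
+ * failing when a non-root memlock limit blocks the registration.
+ */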
+enum t_setup_ret t_register_buffers(struct io_uring *ring,
+ const struct iovec *iovecs,
+ unsigned nr_iovecs)
+{
+ int ret;
+
+ ret = io_uring_register_buffers(ring, iovecs, nr_iovecs);
+ if (!ret)
+ return T_SETUP_OK;
+
+ if ((ret == -EPERM || ret == -ENOMEM) && geteuid()) {
+ fprintf(stdout, "too large non-root buffer registration, skip\n");
+ return T_SETUP_SKIP;
+ }
+
+ fprintf(stderr, "buffer register failed: %s\n", strerror(-ret));
+ return ret;
+}
diff --git a/test/helpers.h b/test/helpers.h
index 74fe162..d0beb93 100644
--- a/test/helpers.h
+++ b/test/helpers.h
@@ -41,6 +41,12 @@ void *t_calloc(size_t nmemb, size_t size);
void t_create_file(const char *file, size_t size);
/*
+ * Helper for creating file and write @size byte buf with @pattern value in
+ * the file.
+ */
+void t_create_file_pattern(const char *file, size_t size, char pattern);
+
+/*
* Helper for creating @buf_num number of iovec
* with @buf_size bytes buffer of each iovec.
*/
@@ -54,6 +60,12 @@ enum t_setup_ret t_create_ring_params(int depth, struct io_uring *ring,
enum t_setup_ret t_create_ring(int depth, struct io_uring *ring,
unsigned int flags);
+enum t_setup_ret t_register_buffers(struct io_uring *ring,
+ const struct iovec *iovecs,
+ unsigned nr_iovecs);
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
#ifdef __cplusplus
}
#endif
diff --git a/test/io-cancel.c b/test/io-cancel.c
index 9a36dd9..d5e3ae9 100644
--- a/test/io-cancel.c
+++ b/test/io-cancel.c
@@ -11,7 +11,7 @@
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "helpers.h"
#include "liburing.h"
@@ -115,7 +115,7 @@ static int do_io(struct io_uring *ring, int fd, int do_write)
return 0;
}
-static int start_cancel(struct io_uring *ring, int do_partial)
+static int start_cancel(struct io_uring *ring, int do_partial, int async_cancel)
{
struct io_uring_sqe *sqe;
int i, ret, submitted = 0;
@@ -128,7 +128,9 @@ static int start_cancel(struct io_uring *ring, int do_partial)
fprintf(stderr, "sqe get failed\n");
goto err;
}
- io_uring_prep_cancel(sqe, (void *) (unsigned long) i + 1, 0);
+ io_uring_prep_cancel64(sqe, i + 1, 0);
+ if (async_cancel)
+ sqe->flags |= IOSQE_ASYNC;
sqe->user_data = 0;
submitted++;
}
@@ -148,7 +150,8 @@ err:
* the submitted IO. This is done to verify that cancelling one piece of IO doesn't
* impact others.
*/
-static int test_io_cancel(const char *file, int do_write, int do_partial)
+static int test_io_cancel(const char *file, int do_write, int do_partial,
+ int async_cancel)
{
struct io_uring ring;
struct timeval start_tv;
@@ -179,7 +182,7 @@ static int test_io_cancel(const char *file, int do_write, int do_partial)
goto err;
/* sleep for 1/3 of the total time, to allow some to start/complete */
usleep(usecs / 3);
- if (start_cancel(&ring, do_partial))
+ if (start_cancel(&ring, do_partial, async_cancel))
goto err;
to_wait = BUFFERS;
if (do_partial)
@@ -243,7 +246,7 @@ static int test_dont_cancel_another_ring(void)
fprintf(stderr, "%s: failed to get sqe\n", __FUNCTION__);
return 1;
}
- io_uring_prep_cancel(sqe, (void *) (unsigned long)1, 0);
+ io_uring_prep_cancel64(sqe, 1, 0);
sqe->user_data = 2;
ret = io_uring_submit(&ring2);
@@ -323,7 +326,7 @@ static int test_cancel_req_across_fork(void)
fprintf(stderr, "%s: failed to get sqe\n", __FUNCTION__);
return 1;
}
- io_uring_prep_cancel(sqe, (void *) (unsigned long)1, 0);
+ io_uring_prep_cancel64(sqe, 1, 0);
sqe->user_data = 2;
ret = io_uring_submit(&ring);
@@ -338,8 +341,21 @@ static int test_cancel_req_across_fork(void)
fprintf(stderr, "wait_cqe=%d\n", ret);
return 1;
}
- if ((cqe->user_data == 1 && cqe->res != -EINTR) ||
- (cqe->user_data == 2 && cqe->res != -EALREADY)) {
+ switch (cqe->user_data) {
+ case 1:
+ if (cqe->res != -EINTR &&
+ cqe->res != -ECANCELED) {
+ fprintf(stderr, "%i %i\n", (int)cqe->user_data, cqe->res);
+ exit(1);
+ }
+ break;
+ case 2:
+ if (cqe->res != -EALREADY && cqe->res) {
+ fprintf(stderr, "%i %i\n", (int)cqe->user_data, cqe->res);
+ exit(1);
+ }
+ break;
+ default:
fprintf(stderr, "%i %i\n", (int)cqe->user_data, cqe->res);
exit(1);
}
@@ -483,6 +499,7 @@ static int test_sqpoll_cancel_iowq_requests(void)
int main(int argc, char *argv[])
{
+ const char *fname = ".io-cancel-test";
int i, ret;
if (argc > 1)
@@ -508,24 +525,26 @@ int main(int argc, char *argv[])
return 1;
}
- t_create_file(".basic-rw", FILE_SIZE);
+ t_create_file(fname, FILE_SIZE);
vecs = t_create_buffers(BUFFERS, BS);
- for (i = 0; i < 4; i++) {
- int v1 = (i & 1) != 0;
- int v2 = (i & 2) != 0;
+ for (i = 0; i < 8; i++) {
+ int write = (i & 1) != 0;
+ int partial = (i & 2) != 0;
+ int async = (i & 4) != 0;
- ret = test_io_cancel(".basic-rw", v1, v2);
+ ret = test_io_cancel(fname, write, partial, async);
if (ret) {
- fprintf(stderr, "test_io_cancel %d %d failed\n", v1, v2);
+ fprintf(stderr, "test_io_cancel %d %d %d failed\n",
+ write, partial, async);
goto err;
}
}
- unlink(".basic-rw");
+ unlink(fname);
return 0;
err:
- unlink(".basic-rw");
+ unlink(fname);
return 1;
}
diff --git a/test/io_uring_enter.c b/test/io_uring_enter.c
index a6bb8f5..ef00bf6 100644
--- a/test/io_uring_enter.c
+++ b/test/io_uring_enter.c
@@ -30,62 +30,41 @@
#include "../src/syscall.h"
#define IORING_MAX_ENTRIES 4096
+#define IORING_MAX_ENTRIES_FALLBACK 128
-int
-expect_failed_submit(struct io_uring *ring, int error)
-{
- int ret;
-
- ret = io_uring_submit(ring);
- if (ret == 1) {
- printf("expected failure, but io_uring_submit succeeded.\n");
- return 1;
- }
-
- if (errno != error) {
- printf("expected %d, got %d\n", error, errno);
- return 1;
- }
-
- return 0;
-}
-
-int
-expect_fail(int fd, unsigned int to_submit, unsigned int min_complete,
- unsigned int flags, sigset_t *sig, int error)
+static int expect_fail(int fd, unsigned int to_submit,
+ unsigned int min_complete, unsigned int flags,
+ sigset_t *sig, int error)
{
int ret;
ret = __sys_io_uring_enter(fd, to_submit, min_complete, flags, sig);
if (ret != -1) {
- printf("expected %s, but call succeeded\n", strerror(error));
+ fprintf(stderr, "expected %s, but call succeeded\n", strerror(error));
return 1;
}
if (errno != error) {
- printf("expected %d, got %d\n", error, errno);
+ fprintf(stderr, "expected %d, got %d\n", error, errno);
return 1;
}
return 0;
}
-int
-try_io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
- unsigned int flags, sigset_t *sig, int expect, int error)
+static int try_io_uring_enter(int fd, unsigned int to_submit,
+ unsigned int min_complete, unsigned int flags,
+ sigset_t *sig, int expect, int error)
{
int ret;
- printf("io_uring_enter(%d, %u, %u, %u, %p)\n", fd, to_submit,
- min_complete, flags, sig);
-
if (expect == -1)
return expect_fail(fd, to_submit, min_complete,
flags, sig, error);
ret = __sys_io_uring_enter(fd, to_submit, min_complete, flags, sig);
if (ret != expect) {
- printf("Expected %d, got %d\n", expect, errno);
+ fprintf(stderr, "Expected %d, got %d\n", expect, errno);
return 1;
}
@@ -95,8 +74,7 @@ try_io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
/*
* prep a read I/O. index is treated like a block number.
*/
-int
-setup_file(char *template, off_t len)
+static int setup_file(char *template, off_t len)
{
int fd, ret;
char buf[4096];
@@ -114,15 +92,15 @@ setup_file(char *template, off_t len)
ret = read(fd, buf, 4096);
if (ret != 4096) {
- printf("read returned %d, expected 4096\n", ret);
+ fprintf(stderr, "read returned %d, expected 4096\n", ret);
exit(1);
}
return fd;
}
-void
-io_prep_read(struct io_uring_sqe *sqe, int fd, off_t offset, size_t len)
+static void io_prep_read(struct io_uring_sqe *sqe, int fd, off_t offset,
+ size_t len)
{
struct iovec *iov;
@@ -137,8 +115,7 @@ io_prep_read(struct io_uring_sqe *sqe, int fd, off_t offset, size_t len)
io_uring_sqe_set_data(sqe, iov); // free on completion
}
-void
-reap_events(struct io_uring *ring, unsigned nr)
+static void reap_events(struct io_uring *ring, unsigned nr)
{
int ret;
unsigned left = nr;
@@ -146,17 +123,15 @@ reap_events(struct io_uring *ring, unsigned nr)
struct iovec *iov;
struct timeval start, now, elapsed;
- printf("Reaping %u I/Os\n", nr);
gettimeofday(&start, NULL);
while (left) {
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
- printf("io_uring_wait_cqe returned %d\n", ret);
- printf("expected success\n");
+ fprintf(stderr, "io_uring_wait_cqe returned %d\n", ret);
exit(1);
}
if (cqe->res != 4096)
- printf("cqe->res: %d, expected 4096\n", cqe->res);
+ fprintf(stderr, "cqe->res: %d, expected 4096\n", cqe->res);
iov = io_uring_cqe_get_data(cqe);
free(iov->iov_base);
free(iov);
@@ -166,15 +141,14 @@ reap_events(struct io_uring *ring, unsigned nr)
gettimeofday(&now, NULL);
timersub(&now, &start, &elapsed);
if (elapsed.tv_sec > 10) {
- printf("Timed out waiting for I/Os to complete.\n");
- printf("%u expected, %u completed\n", nr, left);
+ fprintf(stderr, "Timed out waiting for I/Os to complete.\n");
+ fprintf(stderr, "%u expected, %u completed\n", nr, left);
break;
}
}
}
-void
-submit_io(struct io_uring *ring, unsigned nr)
+static void submit_io(struct io_uring *ring, unsigned nr)
{
int fd, ret;
off_t file_len;
@@ -182,7 +156,6 @@ submit_io(struct io_uring *ring, unsigned nr)
static char template[32] = "/tmp/io_uring_enter-test.XXXXXX";
struct io_uring_sqe *sqe;
- printf("Allocating %u sqes\n", nr);
file_len = nr * 4096;
fd = setup_file(template, file_len);
for (i = 0; i < nr; i++) {
@@ -193,18 +166,15 @@ submit_io(struct io_uring *ring, unsigned nr)
}
/* submit the I/Os */
- printf("Submitting %u I/Os\n", nr);
ret = io_uring_submit(ring);
unlink(template);
if (ret < 0) {
perror("io_uring_enter");
exit(1);
}
- printf("Done\n");
}
-int
-main(int argc, char **argv)
+int main(int argc, char **argv)
{
int ret;
unsigned int status = 0;
@@ -218,6 +188,8 @@ main(int argc, char **argv)
return 0;
ret = io_uring_queue_init(IORING_MAX_ENTRIES, &ring, 0);
+ if (ret == -ENOMEM)
+ ret = io_uring_queue_init(IORING_MAX_ENTRIES_FALLBACK, &ring, 0);
if (ret < 0) {
perror("io_uring_queue_init");
exit(1);
@@ -234,12 +206,11 @@ main(int argc, char **argv)
status |= try_io_uring_enter(0, 0, 0, 0, NULL, -1, EOPNOTSUPP);
/* to_submit: 0, flags: 0; should get back 0. */
- status |= try_io_uring_enter(ring.ring_fd, 1, 0, 0, NULL, 0, 0);
+ status |= try_io_uring_enter(ring.ring_fd, 0, 0, 0, NULL, 0, 0);
/* fill the sq ring */
sq_entries = *ring.sq.kring_entries;
submit_io(&ring, sq_entries);
- printf("Waiting for %u events\n", sq_entries);
ret = __sys_io_uring_enter(ring.ring_fd, 0, sq_entries,
IORING_ENTER_GETEVENTS, NULL);
if (ret < 0) {
@@ -253,7 +224,7 @@ main(int argc, char **argv)
*/
completed = *ring.cq.ktail - *ring.cq.khead;
if (completed != sq_entries) {
- printf("Submitted %u I/Os, but only got %u completions\n",
+ fprintf(stderr, "Submitted %u I/Os, but only got %u completions\n",
sq_entries, completed);
status = 1;
}
@@ -264,7 +235,6 @@ main(int argc, char **argv)
* Add an invalid index to the submission queue. This should
* result in the dropped counter increasing.
*/
- printf("Submitting invalid sqe index.\n");
index = *sq->kring_entries + 1; // invalid index
dropped = *sq->kdropped;
ktail = *sq->ktail;
@@ -279,15 +249,13 @@ main(int argc, char **argv)
ret = __sys_io_uring_enter(ring.ring_fd, 1, 0, 0, NULL);
/* now check to see if our sqe was dropped */
if (*sq->kdropped == dropped) {
- printf("dropped counter did not increase\n");
+ fprintf(stderr, "dropped counter did not increase\n");
status = 1;
}
- if (!status) {
- printf("PASS\n");
+ if (!status)
return 0;
- }
- printf("FAIL\n");
+ fprintf(stderr, "FAIL\n");
return -1;
}
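
The expect_fail() helpers above depend on the raw syscall convention: __sys_io_uring_enter() returns -1 and sets errno, unlike the higher-level liburing wrappers, which return the negated error directly. A minimal sketch of the same errno probe against the bare syscall, assuming kernel headers that define __NR_io_uring_enter (the wrapper name here is illustrative, not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* illustrative raw wrapper; the tests use liburing's __sys_io_uring_enter() */
static int raw_io_uring_enter(int fd, unsigned to_submit,
			      unsigned min_complete, unsigned flags)
{
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, NULL, 0);
}

int main(void)
{
	/* -1 is not a valid fd, so the kernel must fail with EBADF */
	if (raw_io_uring_enter(-1, 0, 0, 0) != -1 || errno != EBADF) {
		fprintf(stderr, "expected EBADF, got %s\n", strerror(errno));
		return 1;
	}
	return 0;
}
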
diff --git a/test/io_uring_register.c b/test/io_uring_register.c
index 53e3987..e639f05 100644
--- a/test/io_uring_register.c
+++ b/test/io_uring_register.c
@@ -31,19 +31,27 @@ static int pagesize;
static rlim_t mlock_limit;
static int devnull;
-int
-expect_fail(int fd, unsigned int opcode, void *arg,
+#if !defined(CONFIG_HAVE_MEMFD_CREATE)
+#include <sys/syscall.h>
+#include <linux/memfd.h>
+
+static int memfd_create(const char *name, unsigned int flags)
+{
+ return (int)syscall(SYS_memfd_create, name, flags);
+}
+#endif
+
+
+static int expect_fail(int fd, unsigned int opcode, void *arg,
unsigned int nr_args, int error)
{
int ret;
- printf("io_uring_register(%d, %u, %p, %u)\n",
- fd, opcode, arg, nr_args);
ret = __sys_io_uring_register(fd, opcode, arg, nr_args);
if (ret != -1) {
int ret2 = 0;
- printf("expected %s, but call succeeded\n", strerror(error));
+ fprintf(stderr, "expected %s, but call succeeded\n", strerror(error));
if (opcode == IORING_REGISTER_BUFFERS) {
ret2 = __sys_io_uring_register(fd,
IORING_UNREGISTER_BUFFERS, 0, 0);
@@ -52,21 +60,20 @@ expect_fail(int fd, unsigned int opcode, void *arg,
IORING_UNREGISTER_FILES, 0, 0);
}
if (ret2) {
- printf("internal error: failed to unregister\n");
+ fprintf(stderr, "internal error: failed to unregister\n");
exit(1);
}
return 1;
}
if (errno != error) {
- printf("expected %d, got %d\n", error, errno);
+ fprintf(stderr, "expected %d, got %d\n", error, errno);
return 1;
}
return 0;
}
-int
-new_io_uring(int entries, struct io_uring_params *p)
+static int new_io_uring(int entries, struct io_uring_params *p)
{
int fd;
@@ -80,8 +87,7 @@ new_io_uring(int entries, struct io_uring_params *p)
#define MAXFDS (UINT_MAX * sizeof(int))
-void *
-map_filebacked(size_t size)
+static void *map_filebacked(size_t size)
{
int fd, ret;
void *addr;
@@ -116,8 +122,7 @@ map_filebacked(size_t size)
* NOTE: this is now limited by SCM_MAX_FD (253). Keep the code for now,
* but probably should augment it to test 253 and 254, specifically.
*/
-int
-test_max_fds(int uring_fd)
+static int test_max_fds(int uring_fd)
{
int status = 1;
int ret;
@@ -137,14 +142,11 @@ test_max_fds(int uring_fd)
fd_as = mmap(NULL, UINT_MAX * sizeof(int), PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (fd_as == MAP_FAILED) {
- if (errno == ENOMEM) {
- printf("Not enough memory for this test, skipping\n");
+ if (errno == ENOMEM)
return 0;
- }
perror("mmap fd_as");
exit(1);
}
- printf("allocated %zu bytes of address space\n", UINT_MAX * sizeof(int));
fdtable_fd = mkstemp(template);
if (fdtable_fd < 0) {
@@ -182,7 +184,7 @@ test_max_fds(int uring_fd)
fds = mmap(fds, 128*1024*1024, PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_FIXED, fdtable_fd, 0);
if (fds == MAP_FAILED) {
- printf("mmap failed at offset %lu\n",
+ fprintf(stderr, "mmap failed at offset %lu\n",
(unsigned long)((char *)fd_as - (char *)fds));
exit(1);
}
@@ -201,21 +203,15 @@ test_max_fds(int uring_fd)
nr_fds /= 2;
continue;
}
- printf("io_uring_register(%d, IORING_REGISTER_FILES, %p, %llu)"
- "...succeeded\n", uring_fd, fd_as, nr_fds);
status = 0;
- printf("io_uring_register(%d, IORING_UNREGISTER_FILES, 0, 0)...",
- uring_fd);
ret = __sys_io_uring_register(uring_fd, IORING_UNREGISTER_FILES,
0, 0);
if (ret < 0) {
ret = errno;
- printf("failed\n");
errno = ret;
perror("io_uring_register UNREGISTER_FILES");
exit(1);
}
- printf("succeeded\n");
break;
}
@@ -223,15 +219,14 @@ test_max_fds(int uring_fd)
close(fdtable_fd);
ret = munmap(fd_as, UINT_MAX * sizeof(int));
if (ret != 0) {
- printf("munmap(%zu) failed\n", UINT_MAX * sizeof(int));
+ fprintf(stderr, "munmap(%zu) failed\n", UINT_MAX * sizeof(int));
exit(1);
}
return status;
}
-int
-test_memlock_exceeded(int fd)
+static int test_memlock_exceeded(int fd)
{
int ret;
void *buf;
@@ -249,21 +244,21 @@ test_memlock_exceeded(int fd)
ret = __sys_io_uring_register(fd, IORING_REGISTER_BUFFERS, &iov, 1);
if (ret < 0) {
if (errno == ENOMEM) {
- printf("io_uring_register of %zu bytes failed "
- "with ENOMEM (expected).\n", iov.iov_len);
iov.iov_len /= 2;
continue;
}
- printf("expected success or EFAULT, got %d\n", errno);
+ if (errno == EFAULT) {
+ free(buf);
+ return 0;
+ }
+ fprintf(stderr, "expected success or EFAULT, got %d\n", errno);
free(buf);
return 1;
}
- printf("successfully registered %zu bytes (%d).\n",
- iov.iov_len, ret);
ret = __sys_io_uring_register(fd, IORING_UNREGISTER_BUFFERS,
NULL, 0);
if (ret != 0) {
- printf("error: unregister failed with %d\n", errno);
+ fprintf(stderr, "error: unregister failed with %d\n", errno);
free(buf);
return 1;
}
@@ -276,8 +271,7 @@ test_memlock_exceeded(int fd)
return 0;
}
-int
-test_iovec_nr(int fd)
+static int test_iovec_nr(int fd)
{
int i, ret, status = 0;
unsigned int nr = 1000000;
@@ -300,15 +294,15 @@ test_iovec_nr(int fd)
/* reduce to UIO_MAXIOV */
nr = UIO_MAXIOV;
- printf("io_uring_register(%d, %u, %p, %u)\n",
- fd, IORING_REGISTER_BUFFERS, iovs, nr);
ret = __sys_io_uring_register(fd, IORING_REGISTER_BUFFERS, iovs, nr);
- if (ret != 0) {
- printf("expected success, got %d\n", errno);
+ if (ret && (errno == ENOMEM || errno == EPERM) && geteuid()) {
+ fprintf(stderr, "can't register large iovec for regular users, skip\n");
+ } else if (ret != 0) {
+ fprintf(stderr, "expected success, got %d\n", errno);
status = 1;
- } else
+ } else {
__sys_io_uring_register(fd, IORING_UNREGISTER_BUFFERS, 0, 0);
-
+ }
free(buf);
free(iovs);
return status;
@@ -317,8 +311,7 @@ test_iovec_nr(int fd)
/*
* io_uring limit is 1G. iov_len limit is ~OUL, I think
*/
-int
-test_iovec_size(int fd)
+static int test_iovec_size(int fd)
{
unsigned int status = 0;
int ret;
@@ -370,11 +363,10 @@ test_iovec_size(int fd)
"RLIMIT_MEMLOCK resource limit by at "
"least 2MB.");
else {
- printf("expected success, got %d\n", errno);
+ fprintf(stderr, "expected success, got %d\n", errno);
status = 1;
}
} else {
- printf("Success!\n");
ret = __sys_io_uring_register(fd,
IORING_UNREGISTER_BUFFERS, 0, 0);
if (ret < 0) {
@@ -392,7 +384,6 @@ test_iovec_size(int fd)
status = 1;
iov.iov_base = buf;
iov.iov_len = 2*1024*1024;
- printf("reserve file-backed buffers\n");
status |= expect_fail(fd, IORING_REGISTER_BUFFERS, &iov, 1, EOPNOTSUPP);
munmap(buf, 2*1024*1024);
@@ -405,18 +396,7 @@ test_iovec_size(int fd)
return status;
}
-void
-dump_sqe(struct io_uring_sqe *sqe)
-{
- printf("\topcode: %d\n", sqe->opcode);
- printf("\tflags: 0x%.8x\n", sqe->flags);
- printf("\tfd: %d\n", sqe->fd);
- if (sqe->opcode == IORING_OP_POLL_ADD)
- printf("\tpoll_events: 0x%.8x\n", sqe->poll_events);
-}
-
-int
-ioring_poll(struct io_uring *ring, int fd, int fixed)
+static int ioring_poll(struct io_uring *ring, int fd, int fixed)
{
int ret;
struct io_uring_sqe *sqe;
@@ -430,22 +410,20 @@ ioring_poll(struct io_uring *ring, int fd, int fixed)
sqe->fd = fd;
sqe->poll_events = POLLIN|POLLOUT;
- printf("io_uring_submit:\n");
- dump_sqe(sqe);
ret = io_uring_submit(ring);
if (ret != 1) {
- printf("failed to submit poll sqe: %d.\n", errno);
+ fprintf(stderr, "failed to submit poll sqe: %d.\n", errno);
return 1;
}
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
- printf("io_uring_wait_cqe failed with %d\n", ret);
+ fprintf(stderr, "io_uring_wait_cqe failed with %d\n", ret);
return 1;
}
ret = 0;
if (cqe->res != POLLOUT) {
- printf("io_uring_wait_cqe: expected 0x%.8x, got 0x%.8x\n",
+ fprintf(stderr, "io_uring_wait_cqe: expected 0x%.8x, got 0x%.8x\n",
POLLOUT, cqe->res);
ret = 1;
}
@@ -454,8 +432,7 @@ ioring_poll(struct io_uring *ring, int fd, int fixed)
return ret;
}
-int
-test_poll_ringfd(void)
+static int test_poll_ringfd(void)
{
int status = 0;
int ret;
@@ -592,8 +569,7 @@ out:
return 0;
}
-int
-main(int argc, char **argv)
+int main(int argc, char **argv)
{
int fd, ret;
unsigned int status = 0;
@@ -611,7 +587,6 @@ main(int argc, char **argv)
return 1;
}
mlock_limit = rlim.rlim_cur;
- printf("RELIMIT_MEMLOCK: %lu (%lu)\n", rlim.rlim_cur, rlim.rlim_max);
devnull = open("/dev/null", O_RDWR);
if (devnull < 0) {
perror("open /dev/null");
@@ -643,10 +618,8 @@ main(int argc, char **argv)
/* uring poll on the uring fd */
status |= test_poll_ringfd();
- if (!status)
- printf("PASS\n");
- else
- printf("FAIL\n");
+ if (status)
+ fprintf(stderr, "FAIL\n");
ret = test_shmem();
if (ret) {
diff --git a/test/io_uring_setup.c b/test/io_uring_setup.c
index a0709a7..7752c97 100644
--- a/test/io_uring_setup.c
+++ b/test/io_uring_setup.c
@@ -99,27 +99,23 @@ dump_resv(struct io_uring_params *p)
int
try_io_uring_setup(unsigned entries, struct io_uring_params *p, int expect, int error)
{
- int ret, __errno;
-
- printf("io_uring_setup(%u, %p), flags: %s, feat: %s, resv: %s, sq_thread_cpu: %u\n",
- entries, p, flags_string(p), features_string(p), dump_resv(p),
- p ? p->sq_thread_cpu : 0);
+ int ret, err;
ret = __sys_io_uring_setup(entries, p);
if (ret != expect) {
- printf("expected %d, got %d\n", expect, ret);
+ fprintf(stderr, "expected %d, got %d\n", expect, ret);
/* if we got a valid uring, close it */
if (ret > 0)
close(ret);
return 1;
}
- __errno = errno;
- if (expect == -1 && error != __errno) {
- if (__errno == EPERM && geteuid() != 0) {
+ err = errno;
+ if (expect == -1 && error != err) {
+ if (err == EPERM && geteuid() != 0) {
printf("Needs root, not flagging as an error\n");
return 0;
}
- printf("expected errno %d, got %d\n", error, __errno);
+ fprintf(stderr, "expected errno %d, got %d\n", error, err);
return 1;
}
@@ -169,7 +165,7 @@ main(int argc, char **argv)
memset(&p, 0, sizeof(p));
fd = __sys_io_uring_setup(1, &p);
if (fd < 0) {
- printf("io_uring_setup failed with %d, expected success\n",
+ fprintf(stderr, "io_uring_setup failed with %d, expected success\n",
errno);
status = 1;
} else {
@@ -177,16 +173,14 @@ main(int argc, char **argv)
int ret;
ret = read(fd, buf, 4096);
if (ret >= 0) {
- printf("read from io_uring fd succeeded. expected fail\n");
+ fprintf(stderr, "read from io_uring fd succeeded. expected fail\n");
status = 1;
}
}
- if (!status) {
- printf("PASS\n");
+ if (!status)
return 0;
- }
- printf("FAIL\n");
+ fprintf(stderr, "FAIL\n");
return -1;
}
diff --git a/test/iopoll.c b/test/iopoll.c
index 3d94dfe..f3c22d6 100644
--- a/test/iopoll.c
+++ b/test/iopoll.c
@@ -9,7 +9,7 @@
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include "helpers.h"
@@ -60,14 +60,13 @@ static int __test_io(const char *file, struct io_uring *ring, int write, int sqt
struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
int open_flags;
- int i, fd, ret;
+ int i, fd = -1, ret;
off_t offset;
- if (buf_select && write)
+ if (buf_select) {
write = 0;
- if (buf_select && fixed)
fixed = 0;
-
+ }
if (buf_select && provide_buffers(ring))
return 1;
@@ -77,19 +76,20 @@ static int __test_io(const char *file, struct io_uring *ring, int write, int sqt
open_flags = O_RDONLY;
open_flags |= O_DIRECT;
- fd = open(file, open_flags);
- if (fd < 0) {
- perror("file open");
- goto err;
- }
-
if (fixed) {
- ret = io_uring_register_buffers(ring, vecs, BUFFERS);
- if (ret) {
+ ret = t_register_buffers(ring, vecs, BUFFERS);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
fprintf(stderr, "buffer reg failed: %d\n", ret);
goto err;
}
}
+ fd = open(file, open_flags);
+ if (fd < 0) {
+ perror("file open");
+ goto err;
+ }
if (sqthread) {
ret = io_uring_register_files(ring, &fd, 1);
if (ret) {
@@ -151,6 +151,12 @@ static int __test_io(const char *file, struct io_uring *ring, int write, int sqt
ret = io_uring_submit(ring);
if (ret != BUFFERS) {
+ ret = io_uring_peek_cqe(ring, &cqe);
+ if (!ret && cqe->res == -EOPNOTSUPP) {
+ no_iopoll = 1;
+ io_uring_cqe_seen(ring, cqe);
+ goto out;
+ }
fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS);
goto err;
}
@@ -271,31 +277,19 @@ static int test_io(const char *file, int write, int sqthread, int fixed,
int buf_select)
{
struct io_uring ring;
- int ret, ring_flags;
+ int ret, ring_flags = IORING_SETUP_IOPOLL;
if (no_iopoll)
return 0;
- ring_flags = IORING_SETUP_IOPOLL;
- if (sqthread) {
- static int warned;
-
- if (geteuid()) {
- if (!warned)
- fprintf(stdout, "SQPOLL requires root, skipping\n");
- warned = 1;
- return 0;
- }
- }
-
- ret = io_uring_queue_init(64, &ring, ring_flags);
- if (ret) {
+ ret = t_create_ring(64, &ring, ring_flags);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
fprintf(stderr, "ring create failed: %d\n", ret);
return 1;
}
-
ret = __test_io(file, &ring, write, sqthread, fixed, buf_select);
-
io_uring_queue_exit(&ring);
return ret;
}
@@ -318,13 +312,14 @@ static int probe_buf_select(void)
fprintf(stdout, "Buffer select not supported, skipping\n");
return 0;
}
- free(p);
+ io_uring_free_probe(p);
return 0;
}
int main(int argc, char *argv[])
{
int i, ret, nr;
+ char buf[256];
char *fname;
if (probe_buf_select())
@@ -333,7 +328,10 @@ int main(int argc, char *argv[])
if (argc > 1) {
fname = argv[1];
} else {
- fname = ".iopoll-rw";
+ srand((unsigned)time(NULL));
+ snprintf(buf, sizeof(buf), ".basic-rw-%u-%u",
+ (unsigned)rand(), (unsigned)getpid());
+ fname = buf;
t_create_file(fname, FILE_SIZE);
}
@@ -343,15 +341,15 @@ int main(int argc, char *argv[])
if (no_buf_select)
nr = 8;
for (i = 0; i < nr; i++) {
- int v1, v2, v3, v4;
+ int write = (i & 1) != 0;
+ int sqthread = (i & 2) != 0;
+ int fixed = (i & 4) != 0;
+ int buf_select = (i & 8) != 0;
- v1 = (i & 1) != 0;
- v2 = (i & 2) != 0;
- v3 = (i & 4) != 0;
- v4 = (i & 8) != 0;
- ret = test_io(fname, v1, v2, v3, v4);
+ ret = test_io(fname, write, sqthread, fixed, buf_select);
if (ret) {
- fprintf(stderr, "test_io failed %d/%d/%d/%d\n", v1, v2, v3, v4);
+ fprintf(stderr, "test_io failed %d/%d/%d/%d\n",
+ write, sqthread, fixed, buf_select);
goto err;
}
if (no_iopoll)
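
The iopoll changes above move ring and buffer setup to the shared t_create_ring()/t_register_buffers() helpers from the suite's helpers.h, which report T_SETUP_SKIP when the environment cannot run a configuration (for example SQPOLL without privileges, or an exhausted RLIMIT_MEMLOCK), so the test exits 0 instead of failing. A minimal sketch of that skip-aware pattern, assuming the helpers behave as used above:

#include <stdio.h>
#include "helpers.h"
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	int ret;

	ret = t_create_ring(64, &ring, IORING_SETUP_IOPOLL);
	if (ret == T_SETUP_SKIP)
		return 0;	/* config unsupported here: skip, not fail */
	if (ret != T_SETUP_OK) {
		fprintf(stderr, "ring create failed: %d\n", ret);
		return 1;
	}
	/* ... queue polled I/O against an O_DIRECT fd here ... */
	io_uring_queue_exit(&ring);
	return 0;
}
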
diff --git a/test/lfs-openat-write.c b/test/lfs-openat-write.c
index ac35e1b..6bbf78d 100644
--- a/test/lfs-openat-write.c
+++ b/test/lfs-openat-write.c
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: MIT */
+
#define _LARGEFILE_SOURCE
#define _FILE_OFFSET_BITS 64
diff --git a/test/lfs-openat.c b/test/lfs-openat.c
index b14238a..4823cc4 100644
--- a/test/lfs-openat.c
+++ b/test/lfs-openat.c
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: MIT */
+
#define _LARGEFILE_SOURCE
#define _FILE_OFFSET_BITS 64
diff --git a/test/link-timeout.c b/test/link-timeout.c
index 5d8417f..ad638e9 100644
--- a/test/link-timeout.c
+++ b/test/link-timeout.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "liburing.h"
@@ -63,7 +63,7 @@ static int test_fail_two_link_timeouts(struct io_uring *ring)
struct __kernel_timespec ts;
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
- int ret, i;
+ int ret, i, nr_wait;
ts.tv_sec = 1;
ts.tv_nsec = 0;
@@ -114,12 +114,13 @@ static int test_fail_two_link_timeouts(struct io_uring *ring)
sqe->user_data = 4;
ret = io_uring_submit(ring);
- if (ret != 4) {
+ if (ret < 3) {
printf("sqe submit failed: %d\n", ret);
goto err;
}
+ nr_wait = ret;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < nr_wait; i++) {
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
printf("wait completion %d\n", ret);
@@ -619,6 +620,8 @@ static int test_timeout_link_chain1(struct io_uring *ring)
io_uring_cqe_seen(ring, cqe);
}
+ close(fds[0]);
+ close(fds[1]);
return 0;
err:
return 1;
@@ -713,6 +716,8 @@ static int test_timeout_link_chain2(struct io_uring *ring)
io_uring_cqe_seen(ring, cqe);
}
+ close(fds[0]);
+ close(fds[1]);
return 0;
err:
return 1;
@@ -833,6 +838,8 @@ static int test_timeout_link_chain3(struct io_uring *ring)
io_uring_cqe_seen(ring, cqe);
}
+ close(fds[0]);
+ close(fds[1]);
return 0;
err:
return 1;
@@ -917,6 +924,8 @@ static int test_timeout_link_chain4(struct io_uring *ring)
io_uring_cqe_seen(ring, cqe);
}
+ close(fds[0]);
+ close(fds[1]);
return 0;
err:
return 1;
@@ -973,14 +982,16 @@ static int test_timeout_link_chain5(struct io_uring *ring)
}
switch (cqe->user_data) {
case 1:
- if (cqe->res) {
- fprintf(stderr, "Timeout got %d, wanted -EINVAL\n",
+ case 2:
+ if (cqe->res && cqe->res != -ECANCELED) {
+ fprintf(stderr, "Request got %d, wanted -EINVAL "
+ "or -ECANCELED\n",
cqe->res);
goto err;
}
break;
- case 2:
- if (cqe->res != -ECANCELED) {
+ case 3:
+ if (cqe->res != -ECANCELED && cqe->res != -EINVAL) {
fprintf(stderr, "Link timeout got %d, wanted -ECANCELED\n", cqe->res);
goto err;
}
diff --git a/test/link.c b/test/link.c
index c89d6b2..41d3899 100644
--- a/test/link.c
+++ b/test/link.c
@@ -178,7 +178,7 @@ static int test_single_link_fail(struct io_uring *ring)
goto err;
}
- io_uring_prep_nop(sqe);
+ io_uring_prep_remove_buffers(sqe, 10, 1);
sqe->flags |= IOSQE_IO_LINK;
sqe = io_uring_get_sqe(ring);
@@ -205,8 +205,8 @@ static int test_single_link_fail(struct io_uring *ring)
printf("failed to get cqe\n");
goto err;
}
- if (i == 0 && cqe->res != -EINVAL) {
- printf("sqe0 failed with %d, wanted -EINVAL\n", cqe->res);
+ if (i == 0 && cqe->res != -ENOENT) {
+ printf("sqe0 failed with %d, wanted -ENOENT\n", cqe->res);
goto err;
}
if (i == 1 && cqe->res != -ECANCELED) {
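
The link.c change swaps the deliberately failing head of the chain: a plain nop no longer fails, so the test now uses io_uring_prep_remove_buffers() against a buffer group that was never provided, which reliably completes with -ENOENT and cancels the linked request. A minimal sketch of that failure-propagation pattern, assuming the same liburing API:

#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* group 1 has no provided buffers, so this completes with -ENOENT */
	io_uring_prep_remove_buffers(sqe, 10, 1);
	sqe->flags |= IOSQE_IO_LINK;
	sqe->user_data = 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);		/* linked behind the failure: -ECANCELED */
	sqe->user_data = 2;

	io_uring_submit(&ring);
	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			return 1;
		printf("data=%llu res=%d\n",
		       (unsigned long long) cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
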
diff --git a/test/link_drain.c b/test/link_drain.c
index a50fe88..b95168d 100644
--- a/test/link_drain.c
+++ b/test/link_drain.c
@@ -111,6 +111,7 @@ int test_link_drain_multi(struct io_uring *ring)
perror("open");
return 1;
}
+ unlink("testfile");
iovecs.iov_base = t_malloc(4096);
iovecs.iov_len = 4096;
@@ -189,12 +190,10 @@ int test_link_drain_multi(struct io_uring *ring)
free(iovecs.iov_base);
close(fd);
- unlink("testfile");
return 0;
err:
free(iovecs.iov_base);
close(fd);
- unlink("testfile");
return 1;
}
diff --git a/test/madvise.c b/test/madvise.c
index 89057af..b85aba8 100644
--- a/test/madvise.c
+++ b/test/madvise.c
@@ -181,7 +181,8 @@ int main(int argc, char *argv[])
break;
}
- if (bad > good)
+ /* too hard to reliably test, just ignore */
+ if (0 && bad > good)
fprintf(stderr, "Suspicious timings (%u > %u)\n", bad, good);
if (fname != argv[1])
unlink(fname);
diff --git a/test/mkdir.c b/test/mkdir.c
index c044652..363fe1e 100644
--- a/test/mkdir.c
+++ b/test/mkdir.c
@@ -58,6 +58,9 @@ int main(int argc, char *argv[])
int ret;
struct io_uring ring;
+ if (argc > 1)
+ return 0;
+
ret = io_uring_queue_init(8, &ring, 0);
if (ret) {
fprintf(stderr, "queue init failed: %d\n", ret);
diff --git a/test/msg-ring.c b/test/msg-ring.c
new file mode 100644
index 0000000..48c4a64
--- /dev/null
+++ b/test/msg-ring.c
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test ring messaging command
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include "liburing.h"
+
+static int no_msg;
+
+static int test_own(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_msg_ring(sqe, ring->ring_fd, 0x10, 0x1234, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 2; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 1:
+ if (cqe->res == -EINVAL || cqe->res == -EOPNOTSUPP) {
+ no_msg = 1;
+ return 0;
+ }
+ if (cqe->res != 0) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return -1;
+ }
+ break;
+ case 0x1234:
+ if (cqe->res != 0x10) {
+ fprintf(stderr, "invalid len %x\n", cqe->res);
+ return -1;
+ }
+ break;
+ default:
+ fprintf(stderr, "Invalid user_data\n");
+ return -1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+static void *wait_cqe_fn(void *data)
+{
+ struct io_uring *ring = data;
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait cqe %d\n", ret);
+ goto err;
+ }
+
+ if (cqe->user_data != 0x5aa5) {
+ fprintf(stderr, "user_data %llx\n", (long long) cqe->user_data);
+ goto err;
+ }
+ if (cqe->res != 0x20) {
+ fprintf(stderr, "len %x\n", cqe->res);
+ goto err;
+ }
+
+ return NULL;
+err:
+ return (void *) (unsigned long) 1;
+}
+
+static int test_remote(struct io_uring *ring, struct io_uring *target)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_msg_ring(sqe, target->ring_fd, 0x20, 0x5aa5, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->res != 0) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return -1;
+ }
+ if (cqe->user_data != 1) {
+ fprintf(stderr, "user_data %llx\n", (long long) cqe->user_data);
+ return -1;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+err:
+ return 1;
+}
+
+static int test_invalid(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_msg_ring(sqe, 1, 0, 0x8989, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->res != -EBADFD) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return -1;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring, ring2, pring;
+ pthread_t thread;
+ void *tret;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_queue_init(8, &ring2, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_queue_init(8, &pring, IORING_SETUP_IOPOLL);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ ret = test_own(&ring);
+ if (ret) {
+ fprintf(stderr, "test_own failed\n");
+ return ret;
+ }
+ if (no_msg) {
+ fprintf(stdout, "Skipped\n");
+ return 0;
+ }
+ ret = test_own(&pring);
+ if (ret) {
+ fprintf(stderr, "test_own iopoll failed\n");
+ return ret;
+ }
+
+ ret = test_invalid(&ring);
+ if (ret) {
+ fprintf(stderr, "test_invalid failed\n");
+ return ret;
+ }
+
+ pthread_create(&thread, NULL, wait_cqe_fn, &ring2);
+
+ ret = test_remote(&ring, &ring2);
+ if (ret) {
+ fprintf(stderr, "test_remote failed\n");
+ return ret;
+ }
+
+ pthread_join(thread, &tret);
+
+ return 0;
+}
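
The new msg-ring test exercises io_uring_prep_msg_ring(), which makes one ring post a completion into another: the target CQE's res carries the len argument and its user_data carries the data argument. A minimal single-process sketch, assuming a kernel that supports IORING_OP_MSG_RING (older kernels complete with -EINVAL or -EOPNOTSUPP, as the test checks):

#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring src, dst;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &src, 0) || io_uring_queue_init(8, &dst, 0))
		return 1;

	sqe = io_uring_get_sqe(&src);
	/* post a CQE into dst with res = 0x10 and user_data = 0xcafe */
	io_uring_prep_msg_ring(sqe, dst.ring_fd, 0x10, 0xcafe, 0);
	sqe->user_data = 1;
	io_uring_submit(&src);

	if (io_uring_wait_cqe(&dst, &cqe))
		return 1;
	printf("dst got res=%d data=%llx\n", cqe->res,
	       (unsigned long long) cqe->user_data);
	io_uring_cqe_seen(&dst, cqe);
	io_uring_queue_exit(&src);
	io_uring_queue_exit(&dst);
	return 0;
}
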
diff --git a/test/multicqes_drain.c b/test/multicqes_drain.c
index 609d583..b16dc52 100644
--- a/test/multicqes_drain.c
+++ b/test/multicqes_drain.c
@@ -14,7 +14,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "liburing.h"
@@ -91,7 +91,7 @@ void io_uring_sqe_prep(int op, struct io_uring_sqe *sqe, unsigned sqe_flags, int
io_uring_prep_nop(sqe);
break;
case cancel:
- io_uring_prep_poll_remove(sqe, (void *)(long)arg);
+ io_uring_prep_poll_remove(sqe, arg);
break;
}
sqe->flags = sqe_flags;
@@ -157,7 +157,7 @@ int generate_opcode(int i, int pre_flags)
return sqe_op;
}
-inline void add_multishot_sqe(int index)
+static inline void add_multishot_sqe(int index)
{
multi_sqes[cnt++] = index;
}
@@ -224,7 +224,7 @@ static int test_generic_drain(struct io_uring *ring)
goto err;
}
- sleep(4);
+ sleep(1);
// TODO: randomize event triggerring order
for (i = 0; i < max_entry; i++) {
if (si[i].op != multi && si[i].op != single)
@@ -233,7 +233,7 @@ static int test_generic_drain(struct io_uring *ring)
if (trigger_event(pipes[i]))
goto err;
}
- sleep(5);
+ sleep(1);
i = 0;
while (!io_uring_peek_cqe(ring, &cqe)) {
cqe_data[i] = cqe->user_data;
@@ -288,9 +288,9 @@ static int test_simple_drain(struct io_uring *ring)
}
}
- io_uring_prep_poll_add(sqe[0], pipe1[0], POLLIN);
- sqe[0]->len |= IORING_POLL_ADD_MULTI;
+ io_uring_prep_poll_multishot(sqe[0], pipe1[0], POLLIN);
sqe[0]->user_data = 0;
+
io_uring_prep_poll_add(sqe[1], pipe2[0], POLLIN);
sqe[1]->user_data = 1;
@@ -320,6 +320,7 @@ static int test_simple_drain(struct io_uring *ring)
io_uring_prep_poll_remove(sqe[0], 0);
sqe[0]->user_data = 2;
+
io_uring_prep_nop(sqe[1]);
sqe[1]->flags |= IOSQE_IO_DRAIN;
sqe[1]->user_data = 3;
@@ -333,18 +334,21 @@ static int test_simple_drain(struct io_uring *ring)
goto err;
}
-
for (i = 0; i < 6; i++) {
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
printf("wait completion %d\n", ret);
goto err;
}
- io_uring_cqe_seen(ring, cqe);
if ((i == 5) && (cqe->user_data != 3))
goto err;
+ io_uring_cqe_seen(ring, cqe);
}
+ close(pipe1[0]);
+ close(pipe1[1]);
+ close(pipe2[0]);
+ close(pipe2[1]);
return 0;
err:
return 1;
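
Two API updates drive the multicqes_drain changes: io_uring_prep_poll_multishot() replaces manually OR-ing IORING_POLL_ADD_MULTI into sqe->len, and io_uring_prep_poll_remove() now takes the target request's user_data as a plain 64-bit key instead of a pointer. A minimal sketch of arming and then removing a multishot poll, assuming multishot poll support:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2], i;

	if (io_uring_queue_init(8, &ring, 0) || pipe(fds))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_multishot(sqe, fds[0], POLLIN);	/* stays armed */
	sqe->user_data = 7;
	io_uring_submit(&ring);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_remove(sqe, 7);	/* key is the poll's user_data */
	sqe->user_data = 8;
	io_uring_submit(&ring);

	/* expect the poll's -ECANCELED plus the remove's 0 */
	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			return 1;
		printf("data=%llu res=%d\n",
		       (unsigned long long) cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
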
diff --git a/test/nop-all-sizes.c b/test/nop-all-sizes.c
index 49b8642..e8e4b9d 100644
--- a/test/nop-all-sizes.c
+++ b/test/nop-all-sizes.c
@@ -37,10 +37,6 @@ static int test_nops(struct io_uring *ring)
int ret, nr, total = 0, i;
nr = fill_nops(ring);
- if (nr < 0) {
- fprintf(stderr, "Fill: %d\n", nr);
- goto err;
- }
ret = io_uring_submit(ring);
if (ret != nr) {
@@ -50,10 +46,6 @@ static int test_nops(struct io_uring *ring)
total += ret;
nr = fill_nops(ring);
- if (nr < 0) {
- fprintf(stderr, "Fill: %d\n", nr);
- goto err;
- }
ret = io_uring_submit(ring);
if (ret != nr) {
diff --git a/test/nop.c b/test/nop.c
index 82201bd..1aa88fc 100644
--- a/test/nop.c
+++ b/test/nop.c
@@ -11,12 +11,16 @@
#include <fcntl.h>
#include "liburing.h"
+#include "test.h"
-static int test_single_nop(struct io_uring *ring)
+static int seq;
+
+static int test_single_nop(struct io_uring *ring, unsigned req_flags)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int ret;
+ bool cqe32 = (ring->flags & IORING_SETUP_CQE32);
sqe = io_uring_get_sqe(ring);
if (!sqe) {
@@ -25,6 +29,8 @@ static int test_single_nop(struct io_uring *ring)
}
io_uring_prep_nop(sqe);
+ sqe->user_data = ++seq;
+ sqe->flags |= req_flags;
ret = io_uring_submit(ring);
if (ret <= 0) {
@@ -37,18 +43,33 @@ static int test_single_nop(struct io_uring *ring)
fprintf(stderr, "wait completion %d\n", ret);
goto err;
}
+ if (!cqe->user_data) {
+ fprintf(stderr, "Unexpected 0 user_data\n");
+ goto err;
+ }
+ if (cqe32) {
+ if (cqe->big_cqe[0] != 0) {
+ fprintf(stderr, "Unexpected extra1\n");
+ goto err;
+ }
+ if (cqe->big_cqe[1] != 0) {
+ fprintf(stderr, "Unexpected extra2\n");
+ goto err;
+ }
+ }
io_uring_cqe_seen(ring, cqe);
return 0;
err:
return 1;
}
-static int test_barrier_nop(struct io_uring *ring)
+static int test_barrier_nop(struct io_uring *ring, unsigned req_flags)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int ret, i;
+ bool cqe32 = (ring->flags & IORING_SETUP_CQE32);
for (i = 0; i < 8; i++) {
sqe = io_uring_get_sqe(ring);
@@ -60,6 +81,8 @@ static int test_barrier_nop(struct io_uring *ring)
io_uring_prep_nop(sqe);
if (i == 4)
sqe->flags = IOSQE_IO_DRAIN;
+ sqe->user_data = ++seq;
+ sqe->flags |= req_flags;
}
ret = io_uring_submit(ring);
@@ -77,6 +100,20 @@ static int test_barrier_nop(struct io_uring *ring)
fprintf(stderr, "wait completion %d\n", ret);
goto err;
}
+ if (!cqe->user_data) {
+ fprintf(stderr, "Unexpected 0 user_data\n");
+ goto err;
+ }
+ if (cqe32) {
+ if (cqe->big_cqe[0] != 0) {
+ fprintf(stderr, "Unexpected extra1\n");
+ goto err;
+ }
+ if (cqe->big_cqe[1] != 0) {
+ fprintf(stderr, "Unexpected extra2\n");
+ goto err;
+ }
+ }
io_uring_cqe_seen(ring, cqe);
}
@@ -85,30 +122,55 @@ err:
return 1;
}
-int main(int argc, char *argv[])
+static int test_ring(unsigned flags)
{
struct io_uring ring;
- int ret;
-
- if (argc > 1)
- return 0;
+ struct io_uring_params p = { };
+ int ret, i;
- ret = io_uring_queue_init(8, &ring, 0);
+ p.flags = flags;
+ ret = io_uring_queue_init_params(8, &ring, &p);
if (ret) {
+ if (ret == -EINVAL)
+ return 0;
fprintf(stderr, "ring setup failed: %d\n", ret);
return 1;
}
- ret = test_single_nop(&ring);
- if (ret) {
- fprintf(stderr, "test_single_nop failed\n");
- return ret;
+ for (i = 0; i < 1000; i++) {
+ unsigned req_flags = (i & 1) ? IOSQE_ASYNC : 0;
+
+ ret = test_single_nop(&ring, req_flags);
+ if (ret) {
+ fprintf(stderr, "test_single_nop failed\n");
+ goto err;
+ }
+
+ ret = test_barrier_nop(&ring, req_flags);
+ if (ret) {
+ fprintf(stderr, "test_barrier_nop failed\n");
+ goto err;
+ }
}
+err:
+ io_uring_queue_exit(&ring);
+ return ret;
+}
- ret = test_barrier_nop(&ring);
- if (ret) {
- fprintf(stderr, "test_barrier_nop failed\n");
- return ret;
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ FOR_ALL_TEST_CONFIGS {
+ ret = test_ring(IORING_GET_TEST_CONFIG_FLAGS());
+ if (ret) {
+ fprintf(stderr, "Normal ring test failed: %s\n",
+ IORING_GET_TEST_CONFIG_DESCRIPTION());
+ return ret;
+ }
}
return 0;
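
test_ring() now re-runs the nop tests across the FOR_ALL_TEST_CONFIGS matrix from the suite's test.h, and with IORING_SETUP_CQE32 each completion carries two extra 64-bit fields that a nop must leave zeroed. A minimal sketch reading those fields, assuming a kernel with CQE32 support:

#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_CQE32))
		return 0;	/* -EINVAL on kernels without CQE32: skip */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->user_data = 1;
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe))
		return 1;
	/* a nop carries no payload, so both extra fields must be zero */
	printf("extra1=%llu extra2=%llu\n",
	       (unsigned long long) cqe->big_cqe[0],
	       (unsigned long long) cqe->big_cqe[1]);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
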
diff --git a/test/open-close.c b/test/open-close.c
index 648737c..d5c116b 100644
--- a/test/open-close.c
+++ b/test/open-close.c
@@ -9,10 +9,119 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include <assert.h>
#include "helpers.h"
#include "liburing.h"
+static int submit_wait(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return 1;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ return ret;
+}
+
+static inline int try_close(struct io_uring *ring, int fd, int slot)
+{
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_close(sqe, fd);
+ __io_uring_set_target_fixed_file(sqe, slot);
+ return submit_wait(ring);
+}
+
+static int test_close_fixed(void)
+{
+ struct io_uring ring;
+ struct io_uring_sqe *sqe;
+ int ret, fds[2];
+ char buf[1];
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return -1;
+ }
+ if (pipe(fds)) {
+ perror("pipe");
+ return -1;
+ }
+
+ ret = try_close(&ring, 0, 0);
+ if (ret == -EINVAL) {
+ fprintf(stderr, "close for fixed files is not supported\n");
+ return 0;
+ } else if (ret != -ENXIO) {
+ fprintf(stderr, "no table failed %i\n", ret);
+ return -1;
+ }
+
+ ret = try_close(&ring, 1, 0);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "set fd failed %i\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_register_files(&ring, fds, 2);
+ if (ret) {
+ fprintf(stderr, "file_register: %d\n", ret);
+ return ret;
+ }
+
+ ret = try_close(&ring, 0, 2);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "out of table failed %i\n", ret);
+ return -1;
+ }
+
+ ret = try_close(&ring, 0, 0);
+ if (ret != 0) {
+ fprintf(stderr, "close failed %i\n", ret);
+ return -1;
+ }
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
+ sqe->flags |= IOSQE_FIXED_FILE;
+ ret = submit_wait(&ring);
+ if (ret != -EBADF) {
+ fprintf(stderr, "read failed %i\n", ret);
+ return -1;
+ }
+
+ ret = try_close(&ring, 0, 1);
+ if (ret != 0) {
+ fprintf(stderr, "close 2 failed %i\n", ret);
+ return -1;
+ }
+
+ ret = try_close(&ring, 0, 0);
+ if (ret != -EBADF) {
+ fprintf(stderr, "empty slot failed %i\n", ret);
+ return -1;
+ }
+
+ close(fds[0]);
+ close(fds[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
static int test_close(struct io_uring *ring, int fd, int is_ring_fd)
{
struct io_uring_cqe *cqe;
@@ -133,6 +242,12 @@ int main(int argc, char *argv[])
goto err;
}
+ ret = test_close_fixed();
+ if (ret) {
+ fprintf(stderr, "test_close_fixed failed\n");
+ goto err;
+ }
+
done:
unlink(path);
if (do_unlink)
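
test_close_fixed() covers closing a slot in the registered file table instead of a regular fd; the slot is selected with __io_uring_set_target_fixed_file(), or the io_uring_prep_close_direct() shorthand used in the direct-open tests. A minimal sketch closing slot 0, assuming kernel support for direct close (older kernels return -EINVAL, which the test tolerates):

#include <stdio.h>
#include <unistd.h>
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2];

	if (io_uring_queue_init(8, &ring, 0) || pipe(fds))
		return 1;
	if (io_uring_register_files(&ring, fds, 2))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_close_direct(sqe, 0);	/* close table slot 0, not an fd */
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe))
		return 1;
	printf("close slot 0: %d\n", cqe->res);	/* 0, or -EINVAL if unsupported */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
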
diff --git a/test/open-direct-link.c b/test/open-direct-link.c
new file mode 100644
index 0000000..33f88f4
--- /dev/null
+++ b/test/open-direct-link.c
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: link <open file><read from file><close file>
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define MAX_FILES 8
+#define FNAME ".link.direct"
+
+static int test(struct io_uring *ring, int skip_success, int drain, int async)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ char buf[4096];
+ int ret, i;
+
+ /* drain and cqe skip are mutually exclusive */
+ if (skip_success && drain)
+ return 1;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_openat_direct(sqe, AT_FDCWD, FNAME, O_RDONLY, 0, 0);
+ if (!drain)
+ sqe->flags |= IOSQE_IO_LINK;
+ if (skip_success)
+ sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+ sqe->user_data = 1;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
+ sqe->flags |= IOSQE_FIXED_FILE;
+ if (drain)
+ sqe->flags |= IOSQE_IO_DRAIN;
+ else
+ sqe->flags |= IOSQE_IO_LINK;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+ sqe->user_data = 2;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_close_direct(sqe, 0);
+ sqe->user_data = 3;
+ if (skip_success)
+ sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
+ if (drain)
+ sqe->flags |= IOSQE_IO_DRAIN;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+
+ ret = io_uring_submit(ring);
+ if (ret != 3) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ if (skip_success) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->user_data != 2) {
+ fprintf(stderr, "Unexpected cqe %lu/%d\n",
+ (unsigned long) cqe->user_data,
+ cqe->res);
+ goto err;
+ }
+ if (cqe->res != sizeof(buf)) {
+ fprintf(stderr, "bad read %d\n", cqe->res);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+ }
+
+ for (i = 0; i < 3; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 1:
+ if (cqe->res) {
+ fprintf(stderr, "bad open %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 2:
+ if (cqe->res != sizeof(buf)) {
+ fprintf(stderr, "bad read %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 3:
+ if (cqe->res) {
+ fprintf(stderr, "bad close %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ struct io_uring_params p = { };
+ int ret, files[MAX_FILES];
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init_params(8, &ring, &p);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+ if (!(p.features & IORING_FEAT_CQE_SKIP))
+ return 0;
+
+ memset(files, -1, sizeof(files));
+ ret = io_uring_register_files(&ring, files, ARRAY_SIZE(files));
+ if (ret) {
+ fprintf(stderr, "Failed registering files\n");
+ return 1;
+ }
+
+ t_create_file(FNAME, 4096);
+
+ ret = test(&ring, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "test 0 0 0 failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "test 0 1 0 failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, 0, 0, 1);
+ if (ret) {
+ fprintf(stderr, "test 0 0 1 failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, 0, 1, 1);
+ if (ret) {
+ fprintf(stderr, "test 0 1 1 failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, 1, 0, 0);
+ if (ret) {
+ fprintf(stderr, "test 1 0 0 failed\n");
+ goto err;
+ }
+
+ ret = test(&ring, 1, 0, 1);
+ if (ret) {
+ fprintf(stderr, "test 1 0 1 failed\n");
+ goto err;
+ }
+
+ unlink(FNAME);
+ return 0;
+err:
+ unlink(FNAME);
+ return 1;
+}
diff --git a/test/open-direct-pick.c b/test/open-direct-pick.c
new file mode 100644
index 0000000..b1597e7
--- /dev/null
+++ b/test/open-direct-pick.c
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various openat(2) tests
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#include "helpers.h"
+#include "liburing.h"
+
+#define FDS 800
+
+static int no_direct_pick;
+
+static int submit_wait(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return 1;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ return ret;
+}
+
+static inline int try_close(struct io_uring *ring, int slot)
+{
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_close_direct(sqe, slot);
+ return submit_wait(ring);
+}
+
+static int do_opens(struct io_uring *ring, const char *path, int nr,
+ int expect_enfile)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int i, ret;
+
+ for (i = 0; i < nr; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+ io_uring_prep_openat_direct(sqe, -1, path, O_RDONLY, 0, 0);
+ sqe->file_index = UINT_MAX;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ ret = cqe->res;
+ if (ret < 0) {
+ if (!i && ret == -EINVAL) {
+ no_direct_pick = 1;
+ return 0;
+ }
+ if (!expect_enfile || ret != -ENFILE) {
+ printf("open=%d, %d\n", cqe->res, i);
+ goto err;
+ }
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+ return 0;
+err:
+ return 1;
+}
+
+static int test_openat(struct io_uring *ring, const char *path)
+{
+ int ret, i;
+
+ /* open all */
+ ret = do_opens(ring, path, FDS, 0);
+ if (ret)
+ goto err;
+ if (no_direct_pick)
+ return 0;
+
+ /* now close 100 randomly */
+ for (i = 0; i < 100; i++) {
+ do {
+ int slot = rand() % FDS;
+ ret = try_close(ring, slot);
+ if (ret == -EBADF)
+ continue;
+ break;
+ } while (1);
+ }
+
+ /* opening 100 should work, we closed 100 */
+ ret = do_opens(ring, path, 100, 0);
+ if (ret)
+ goto err;
+
+ /* we should be full now, expect -ENFILE */
+ ret = do_opens(ring, path, 1, 1);
+ if (ret)
+ goto err;
+
+ return ret;
+err:
+ fprintf(stderr,"%s: err=%d\n", __FUNCTION__, ret);
+ return -1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ const char *path;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return 1;
+ }
+
+ ret = io_uring_register_files_sparse(&ring, FDS);
+ if (ret) {
+ if (ret != -EINVAL) {
+ fprintf(stderr, "Sparse file registration failed\n");
+ return 1;
+ }
+ /* skip, kernel doesn't support sparse file array */
+ return 0;
+ }
+
+ path = "/tmp/.open.close";
+ t_create_file(path, 4096);
+
+ ret = test_openat(&ring, path);
+ if (ret < 0) {
+ if (ret == -EINVAL) {
+ fprintf(stdout, "Open not supported, skipping\n");
+ goto done;
+ }
+ fprintf(stderr, "test_openat absolute failed: %d\n", ret);
+ goto err;
+ }
+
+done:
+ unlink(path);
+ return 0;
+err:
+ unlink(path);
+ return 1;
+}
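
do_opens() sets sqe->file_index to UINT_MAX, which asks the kernel to pick any free slot in the sparse fixed-file table rather than installing at a caller-chosen index; -ENFILE then signals a full table. A minimal sketch, assuming sparse registration and direct-open support (the file name is illustrative):

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	if (io_uring_register_files_sparse(&ring, 8))	/* 8 empty slots */
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_openat_direct(sqe, AT_FDCWD, "some.file", O_RDONLY, 0, 0);
	sqe->file_index = UINT_MAX;	/* let the kernel pick a free slot */
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe))
		return 1;
	printf("open: %d\n", cqe->res);	/* negative on error, -ENFILE when full */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
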
diff --git a/test/openat2.c b/test/openat2.c
index 65f81b1..34c0f85 100644
--- a/test/openat2.c
+++ b/test/openat2.c
@@ -9,11 +9,13 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include <sys/uio.h>
#include "helpers.h"
#include "liburing.h"
-static int test_openat2(struct io_uring *ring, const char *path, int dfd)
+static int test_openat2(struct io_uring *ring, const char *path, int dfd,
+ bool direct, int fixed_index)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
@@ -23,28 +25,212 @@ static int test_openat2(struct io_uring *ring, const char *path, int dfd)
sqe = io_uring_get_sqe(ring);
if (!sqe) {
fprintf(stderr, "get sqe failed\n");
- goto err;
+ return -1;
}
memset(&how, 0, sizeof(how));
- how.flags = O_RDONLY;
- io_uring_prep_openat2(sqe, dfd, path, &how);
+ how.flags = O_RDWR;
+
+ if (!direct)
+ io_uring_prep_openat2(sqe, dfd, path, &how);
+ else
+ io_uring_prep_openat2_direct(sqe, dfd, path, &how, fixed_index);
ret = io_uring_submit(ring);
if (ret <= 0) {
fprintf(stderr, "sqe submit failed: %d\n", ret);
- goto err;
+ return -1;
}
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
fprintf(stderr, "wait completion %d\n", ret);
- goto err;
+ return -1;
}
ret = cqe->res;
io_uring_cqe_seen(ring, cqe);
+
+ if (direct && ret > 0) {
+ close(ret);
+ return -EINVAL;
+ }
return ret;
-err:
- return -1;
+}
+
+static int test_open_fixed(const char *path, int dfd)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ struct io_uring ring;
+ const char pattern = 0xac;
+ char buffer[] = { 0, 0 };
+ int i, ret, fd = -1;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return -1;
+ }
+ ret = io_uring_register_files(&ring, &fd, 1);
+ if (ret) {
+ fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+ return -1;
+ }
+
+ ret = test_openat2(&ring, path, dfd, true, 0);
+ if (ret == -EINVAL) {
+ printf("fixed open isn't supported\n");
+ return 1;
+ } else if (ret) {
+ fprintf(stderr, "direct open failed %d\n", ret);
+ return -1;
+ }
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_write(sqe, 0, &pattern, 1, 0);
+ sqe->user_data = 1;
+ sqe->flags |= IOSQE_FIXED_FILE | IOSQE_IO_LINK;
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_read(sqe, 0, buffer, 1, 0);
+ sqe->user_data = 2;
+ sqe->flags |= IOSQE_FIXED_FILE;
+
+ ret = io_uring_submit(&ring);
+ if (ret != 2) {
+ fprintf(stderr, "%s: got %d, wanted 2\n", __FUNCTION__, ret);
+ return -1;
+ }
+
+ for (i = 0; i < 2; i++) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return -1;
+ }
+ if (cqe->res != 1) {
+ fprintf(stderr, "unexpectetd ret %d\n", cqe->res);
+ return -1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ }
+ if (memcmp(&pattern, buffer, 1) != 0) {
+ fprintf(stderr, "buf validation failed\n");
+ return -1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_open_fixed_fail(const char *path, int dfd)
+{
+ struct io_uring ring;
+ int ret, fd = -1;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return -1;
+ }
+
+ ret = test_openat2(&ring, path, dfd, true, 0);
+ if (ret != -ENXIO) {
+ fprintf(stderr, "install into not existing table, %i\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_register_files(&ring, &fd, 1);
+ if (ret) {
+ fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+ return -1;
+ }
+
+ ret = test_openat2(&ring, path, dfd, true, 1);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "install out of bounds, %i\n", ret);
+ return -1;
+ }
+
+ ret = test_openat2(&ring, path, dfd, true, (1u << 16));
+ if (ret != -EINVAL) {
+ fprintf(stderr, "install out of bounds or u16 overflow, %i\n", ret);
+ return -1;
+ }
+
+ ret = test_openat2(&ring, path, dfd, true, (1u << 16) + 1);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "install out of bounds or u16 overflow, %i\n", ret);
+ return -1;
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+static int test_direct_reinstall(const char *path, int dfd)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ char buf[1] = { 0xfa };
+ struct io_uring ring;
+ int ret, pipe_fds[2];
+ ssize_t ret2;
+
+ if (pipe2(pipe_fds, O_NONBLOCK)) {
+ fprintf(stderr, "pipe() failed\n");
+ return -1;
+ }
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed\n");
+ return -1;
+ }
+ ret = io_uring_register_files(&ring, pipe_fds, 2);
+ if (ret) {
+ fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+ return -1;
+ }
+
+ /* reinstall into the second slot */
+ ret = test_openat2(&ring, path, dfd, true, 1);
+ if (ret != 0) {
+ fprintf(stderr, "reinstall failed, %i\n", ret);
+ return -1;
+ }
+
+ /* verify it's reinstalled, first write into the slot... */
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_write(sqe, 1, buf, sizeof(buf), 0);
+ sqe->flags |= IOSQE_FIXED_FILE;
+
+ ret = io_uring_submit(&ring);
+ if (ret != 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return -1;
+ }
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return ret;
+ }
+ ret = cqe->res;
+ io_uring_cqe_seen(&ring, cqe);
+ if (ret != 1) {
+ fprintf(stderr, "invalid write %i\n", ret);
+ return -1;
+ }
+
+ /* ... and make sure nothing has been written to the pipe */
+ ret2 = read(pipe_fds[0], buf, 1);
+ if (ret2 != 0 && !(ret2 < 0 && errno == EAGAIN)) {
+ fprintf(stderr, "invalid pipe read, %d %d\n", errno, (int)ret2);
+ return -1;
+ }
+
+ close(pipe_fds[0]);
+ close(pipe_fds[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
}
int main(int argc, char *argv[])
@@ -60,12 +246,12 @@ int main(int argc, char *argv[])
}
if (argc > 1) {
- path = "/tmp/.open.close";
+ path = "/tmp/.open.at2";
path_rel = argv[1];
do_unlink = 0;
} else {
- path = "/tmp/.open.close";
- path_rel = ".open.close";
+ path = "/tmp/.open.at2";
+ path_rel = ".open.at2";
do_unlink = 1;
}
@@ -74,7 +260,7 @@ int main(int argc, char *argv[])
if (do_unlink)
t_create_file(path_rel, 4096);
- ret = test_openat2(&ring, path, -1);
+ ret = test_openat2(&ring, path, -1, false, 0);
if (ret < 0) {
if (ret == -EINVAL) {
fprintf(stdout, "openat2 not supported, skipping\n");
@@ -84,12 +270,31 @@ int main(int argc, char *argv[])
goto err;
}
- ret = test_openat2(&ring, path_rel, AT_FDCWD);
+ ret = test_openat2(&ring, path_rel, AT_FDCWD, false, 0);
if (ret < 0) {
fprintf(stderr, "test_openat2 relative failed: %d\n", ret);
goto err;
}
+ ret = test_open_fixed(path, -1);
+ if (ret > 0)
+ goto done;
+ if (ret) {
+ fprintf(stderr, "test_open_fixed failed\n");
+ goto err;
+ }
+ ret = test_open_fixed_fail(path, -1);
+ if (ret) {
+ fprintf(stderr, "test_open_fixed_fail failed\n");
+ goto err;
+ }
+
+ ret = test_direct_reinstall(path, -1);
+ if (ret) {
+ fprintf(stderr, "test_direct_reinstall failed\n");
+ goto err;
+ }
+
done:
unlink(path);
if (do_unlink)
diff --git a/test/pipe-eof.c b/test/pipe-eof.c
index 4c98de9..bf13517 100644
--- a/test/pipe-eof.c
+++ b/test/pipe-eof.c
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: MIT */
+
/*
* Test that closed pipe reads returns 0, instead of waiting for more
* data.
diff --git a/test/poll-cancel-all.c b/test/poll-cancel-all.c
new file mode 100644
index 0000000..35116f5
--- /dev/null
+++ b/test/poll-cancel-all.c
@@ -0,0 +1,472 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: Test IORING_ASYNC_CANCEL_{ALL,FD}
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <poll.h>
+
+#include "liburing.h"
+
+static int no_cancel_flags;
+
+static int test1(struct io_uring *ring, int *fd)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret, i;
+
+ for (i = 0; i < 8; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return 1;
+ }
+
+ io_uring_prep_poll_add(sqe, fd[0], POLLIN);
+ sqe->user_data = i + 1;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret < 8) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return 1;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return 1;
+ }
+
+ /*
+ * Mark CANCEL_ALL to cancel all matching the key, and use
+ * CANCEL_FD to cancel requests matching the specified fd.
+ * This should cancel all the pending poll requests on the pipe
+ * input.
+ */
+ io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
+ sqe->cancel_flags |= IORING_ASYNC_CANCEL_FD;
+ sqe->fd = fd[0];
+ sqe->user_data = 100;
+
+ ret = io_uring_submit(ring);
+ if (ret < 1) {
+ fprintf(stderr, "child: sqe submit failed: %d\n", ret);
+ return 1;
+ }
+
+ for (i = 0; i < 9; i++) {
+ if (no_cancel_flags)
+ break;
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait=%d\n", ret);
+ return 1;
+ }
+ switch (cqe->user_data) {
+ case 100:
+ if (cqe->res == -EINVAL) {
+ no_cancel_flags = 1;
+ break;
+ }
+ if (cqe->res != 8) {
+ fprintf(stderr, "canceled %d\n", cqe->res);
+ return 1;
+ }
+ break;
+ case 1 ... 8:
+ if (cqe->res != -ECANCELED) {
+ fprintf(stderr, "poll res %d\n", cqe->res);
+ return 1;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid user_data %lu\n",
+ (unsigned long) cqe->user_data);
+ return 1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+}
+
+static int test2(struct io_uring *ring, int *fd)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret, i, fd2[2];
+
+ if (pipe(fd2) < 0) {
+ perror("pipe");
+ return 1;
+ }
+
+ for (i = 0; i < 8; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ if (!(i & 1))
+ io_uring_prep_poll_add(sqe, fd[0], POLLIN);
+ else
+ io_uring_prep_poll_add(sqe, fd2[0], POLLIN);
+ sqe->user_data = i & 1;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret < 8) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ /*
+ * Mark CANCEL_ALL to cancel all matching the key, and use
+ * CANCEL_FD to cancel requests matching the specified fd.
+ * This should cancel all the pending poll requests on the pipe
+ * input.
+ */
+ io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
+ sqe->cancel_flags |= IORING_ASYNC_CANCEL_FD;
+ sqe->fd = fd[0];
+ sqe->user_data = 100;
+
+ ret = io_uring_submit(ring);
+ if (ret < 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 5; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait=%d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 100:
+ if (cqe->res != 4) {
+ fprintf(stderr, "canceled %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 0:
+ if (cqe->res != -ECANCELED) {
+ fprintf(stderr, "poll res %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid user_data %lu\n",
+ (unsigned long) cqe->user_data);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ usleep(1000);
+
+ /*
+ * Should not have any pending CQEs now
+ */
+ ret = io_uring_peek_cqe(ring, &cqe);
+ if (!ret) {
+ fprintf(stderr, "Unexpected extra cancel cqe\n");
+ goto err;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ /*
+ * Mark CANCEL_ALL to cancel all matching the key, and use
+ * CANCEL_FD to cancel requests matching the specified fd.
+ * This should cancel all the pending poll requests on the
+ * second pipe's input.
+ */
+ io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
+ sqe->cancel_flags |= IORING_ASYNC_CANCEL_FD;
+ sqe->fd = fd2[0];
+ sqe->user_data = 100;
+
+ ret = io_uring_submit(ring);
+ if (ret < 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 5; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait=%d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 100:
+ if (cqe->res != 4) {
+ fprintf(stderr, "canceled %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 1:
+ if (cqe->res != -ECANCELED) {
+ fprintf(stderr, "poll res %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid user_data %lu\n",
+ (unsigned long) cqe->user_data);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ close(fd2[0]);
+ close(fd2[1]);
+ return 0;
+err:
+ close(fd2[0]);
+ close(fd2[1]);
+ return 1;
+}
+
+static int test3(struct io_uring *ring, int *fd)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret, i, fd2[2];
+
+ if (pipe(fd2) < 0) {
+ perror("pipe");
+ return 1;
+ }
+
+ for (i = 0; i < 8; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ if (!(i & 1)) {
+ io_uring_prep_poll_add(sqe, fd[0], POLLIN);
+ sqe->flags |= IOSQE_ASYNC;
+ } else
+ io_uring_prep_poll_add(sqe, fd2[0], POLLIN);
+ sqe->user_data = i & 1;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret < 8) {
+ fprintf(stderr, "child: sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ usleep(10000);
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ /*
+ * Mark CANCEL_ALL to cancel all matching the key, and use
+ * CANCEL_ANY to match pending requests regardless of which fd
+ * they target. This should cancel the poll requests on both
+ * pipes.
+ */
+ io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
+ sqe->cancel_flags |= IORING_ASYNC_CANCEL_ANY;
+ sqe->fd = 0;
+ sqe->user_data = 100;
+
+ ret = io_uring_submit(ring);
+ if (ret < 1) {
+ fprintf(stderr, "child: sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 9; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait=%d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 100:
+ if (cqe->res != 8) {
+ fprintf(stderr, "canceled %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 0:
+ case 1:
+ if (cqe->res != -ECANCELED) {
+ fprintf(stderr, "poll res %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid user_data %lu\n",
+ (unsigned long) cqe->user_data);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ close(fd2[0]);
+ close(fd2[1]);
+ return 0;
+err:
+ close(fd2[0]);
+ close(fd2[1]);
+ return 1;
+}
+
+static int test4(struct io_uring *ring, int *fd)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ char buffer[32];
+ int ret, i;
+
+ for (i = 0; i < 8; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ io_uring_prep_read(sqe, fd[0], &buffer, sizeof(buffer), 0);
+ sqe->flags |= IOSQE_ASYNC;
+ sqe->user_data = i + 1;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret < 8) {
+ fprintf(stderr, "child: sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ usleep(10000);
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ goto err;
+ }
+
+ /*
+ * Mark CANCEL_ALL to cancel all matching the key, and use
+ * CANCEL_ANY to match pending requests regardless of fd. This
+ * should cancel all the async reads queued on the pipe input.
+ */
+ io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
+ sqe->cancel_flags |= IORING_ASYNC_CANCEL_ANY;
+ sqe->fd = 0;
+ sqe->user_data = 100;
+
+ ret = io_uring_submit(ring);
+ if (ret < 1) {
+		fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < 9; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait=%d\n", ret);
+ goto err;
+ }
+ switch (cqe->user_data) {
+ case 100:
+ if (cqe->res != 8) {
+ fprintf(stderr, "canceled %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ case 1 ... 8:
+ if (cqe->res != -ECANCELED) {
+			fprintf(stderr, "read res %d\n", cqe->res);
+ goto err;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid user_data %lu\n",
+ (unsigned long) cqe->user_data);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret, fd[2];
+
+ if (argc > 1)
+ return 0;
+
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ return 1;
+ }
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ ret = test1(&ring, fd);
+ if (ret) {
+ fprintf(stderr, "test1 failed\n");
+ return ret;
+ }
+ if (no_cancel_flags)
+ return 0;
+
+ ret = test2(&ring, fd);
+ if (ret) {
+ fprintf(stderr, "test2 failed\n");
+ return ret;
+ }
+
+ ret = test3(&ring, fd);
+ if (ret) {
+ fprintf(stderr, "test3 failed\n");
+ return ret;
+ }
+
+ ret = test4(&ring, fd);
+ if (ret) {
+ fprintf(stderr, "test4 failed\n");
+ return ret;
+ }
+
+ return 0;
+}
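
Condensed, the flow all three cancel tests share is: queue the work, queue one
IORING_OP_ASYNC_CANCEL SQE with the match criteria in cancel_flags, then read
the number of cancelled requests out of cqe->res. A sketch of the cancel-by-fd
variant that test2() exercises (cancel_all_on_fd() is illustrative, not part of
the patch; assumes liburing.h/errno.h and a kernel with IORING_ASYNC_CANCEL_FD,
5.19+):

	/* Cancel every pending request targeting fd; returns the number
	 * of requests cancelled, or a negative error. */
	static int cancel_all_on_fd(struct io_uring *ring, int fd)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int ret;

		io_uring_prep_cancel(sqe, 0, IORING_ASYNC_CANCEL_ALL);
		sqe->cancel_flags |= IORING_ASYNC_CANCEL_FD;
		sqe->fd = fd;

		ret = io_uring_submit(ring);
		if (ret < 1)
			return ret < 0 ? ret : -EIO;
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret)
			return ret;
		ret = cqe->res;	/* count of cancelled requests */
		io_uring_cqe_seen(ring, cqe);
		return ret;
	}
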
diff --git a/test/poll-cancel-ton.c b/test/poll-cancel-ton.c
index e9d612e..b023394 100644
--- a/test/poll-cancel-ton.c
+++ b/test/poll-cancel-ton.c
@@ -9,9 +9,9 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/wait.h>
-#include <sys/signal.h>
+#include <signal.h>
#include "liburing.h"
@@ -55,7 +55,7 @@ static int del_polls(struct io_uring *ring, int fd, int nr)
sqe = io_uring_get_sqe(ring);
data = sqe_index[lrand48() % nr];
- io_uring_prep_poll_remove(sqe, data);
+ io_uring_prep_poll_remove(sqe, (__u64)(uintptr_t)data);
}
ret = io_uring_submit(ring);
@@ -71,10 +71,10 @@ static int del_polls(struct io_uring *ring, int fd, int nr)
static int add_polls(struct io_uring *ring, int fd, int nr)
{
- int pending, batch, i, count, ret;
+ int batch, i, count, ret;
struct io_uring_sqe *sqe;
- pending = count = 0;
+ count = 0;
while (nr) {
batch = 1024;
if (batch > nr)
@@ -93,7 +93,6 @@ static int add_polls(struct io_uring *ring, int fd, int nr)
return 1;
}
nr -= batch;
- pending += batch;
reap_events(ring, batch, 1);
}
return 0;
@@ -129,9 +128,6 @@ int main(int argc, char *argv[])
}
add_polls(&ring, pipe1[0], 30000);
-#if 0
- usleep(1000);
-#endif
del_polls(&ring, pipe1[0], 30000);
io_uring_queue_exit(&ring);
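
The io_uring_prep_poll_remove() call sites above gain a (__u64)(uintptr_t)
double cast because the prototype now takes a 64-bit user_data key rather than
a pointer; going through uintptr_t keeps 32-bit builds warning-free. A sketch
of the add/remove pairing this preserves (arm_then_remove() is illustrative
only):

	static int arm_then_remove(struct io_uring *ring, int fd, void *key)
	{
		struct io_uring_sqe *sqe;

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_poll_add(sqe, fd, POLLIN);
		io_uring_sqe_set_data(sqe, key);	/* key to match on */

		sqe = io_uring_get_sqe(ring);
		/* removal matches on the original 64-bit user_data; it
		 * completes with 0, or -ENOENT if the poll already fired */
		io_uring_prep_poll_remove(sqe, (__u64)(uintptr_t)key);

		return io_uring_submit(ring) == 2 ? 0 : 1;
	}
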
diff --git a/test/poll-cancel.c b/test/poll-cancel.c
index a74e915..0714a80 100644
--- a/test/poll-cancel.c
+++ b/test/poll-cancel.c
@@ -9,9 +9,9 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/wait.h>
-#include <sys/signal.h>
+#include <signal.h>
#include "liburing.h"
@@ -26,7 +26,7 @@ static void sig_alrm(int sig)
exit(1);
}
-int main(int argc, char *argv[])
+static int test_poll_cancel(void)
{
struct io_uring ring;
int pipe1[2];
@@ -36,9 +36,6 @@ int main(int argc, char *argv[])
struct sigaction act;
int ret;
- if (argc > 1)
- return 0;
-
if (pipe(pipe1) != 0) {
perror("pipe");
return 1;
@@ -82,7 +79,7 @@ int main(int argc, char *argv[])
pds[1].is_poll = 0;
pds[1].is_cancel = 1;
- io_uring_prep_poll_remove(sqe, &pds[0]);
+ io_uring_prep_poll_remove(sqe, (__u64)(uintptr_t)&pds[0]);
io_uring_sqe_set_data(sqe, &pds[1]);
ret = io_uring_submit(&ring);
@@ -130,6 +127,102 @@ int main(int argc, char *argv[])
return 1;
}
+ close(pipe1[0]);
+ close(pipe1[1]);
io_uring_cqe_seen(&ring, cqe);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+
+static int __test_poll_cancel_with_timeouts(void)
+{
+ struct __kernel_timespec ts = { .tv_sec = 10, };
+ struct io_uring ring, ring2;
+ struct io_uring_sqe *sqe;
+ int ret, off_nr = 1000;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ ret = io_uring_queue_init(1, &ring2, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ /* test timeout-offset triggering path during cancellation */
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_timeout(sqe, &ts, off_nr, 0);
+
+ /* poll ring2 to trigger cancellation on exit() */
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_poll_add(sqe, ring2.ring_fd, POLLIN);
+ sqe->flags |= IOSQE_IO_LINK;
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_link_timeout(sqe, &ts, 0);
+
+ ret = io_uring_submit(&ring);
+ if (ret != 3) {
+ fprintf(stderr, "sqe submit failed\n");
+ return 1;
+ }
+
+ /* just drop all rings/etc. intact, exit() will clean them up */
+ return 0;
+}
+
+static int test_poll_cancel_with_timeouts(void)
+{
+ int ret;
+ pid_t p;
+
+ p = fork();
+ if (p == -1) {
+ fprintf(stderr, "fork() failed\n");
+ return 1;
+ }
+
+ if (p == 0) {
+ ret = __test_poll_cancel_with_timeouts();
+ exit(ret);
+ } else {
+ int wstatus;
+
+ if (waitpid(p, &wstatus, 0) == (pid_t)-1) {
+ perror("waitpid()");
+ return 1;
+ }
+ if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus)) {
+ fprintf(stderr, "child failed %i\n", WEXITSTATUS(wstatus));
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = test_poll_cancel();
+ if (ret) {
+ fprintf(stderr, "test_poll_cancel failed\n");
+ return -1;
+ }
+
+ ret = test_poll_cancel_with_timeouts();
+ if (ret) {
+ fprintf(stderr, "test_poll_cancel_with_timeouts failed\n");
+ return -1;
+ }
+
return 0;
}
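
The new child-side helper leans on linked timeouts: IOSQE_IO_LINK chains the
poll to the following IORING_OP_LINK_TIMEOUT, so the timeout cancels the poll
if it has not completed first. Reduced to its core (same calls as
__test_poll_cancel_with_timeouts() above, error checks trimmed):

	static int poll_with_timeout(struct io_uring *ring, int fd,
				     struct __kernel_timespec *ts)
	{
		struct io_uring_sqe *sqe;

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_poll_add(sqe, fd, POLLIN);
		sqe->flags |= IOSQE_IO_LINK;	/* chain to the timeout */

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_link_timeout(sqe, ts, 0);

		/* both SQEs are submitted together */
		return io_uring_submit(ring) == 2 ? 0 : 1;
	}
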
diff --git a/test/poll-link.c b/test/poll-link.c
index 4b4f9aa..197ad77 100644
--- a/test/poll-link.c
+++ b/test/poll-link.c
@@ -11,6 +11,7 @@
#include <netinet/tcp.h>
#include <netinet/in.h>
#include <poll.h>
+#include <arpa/inet.h>
#include "liburing.h"
@@ -42,7 +43,8 @@ struct data {
unsigned expected[2];
unsigned is_mask[2];
unsigned long timeout;
- int port;
+ unsigned short port;
+ unsigned int addr;
int stop;
};
@@ -59,7 +61,7 @@ static void *send_thread(void *arg)
addr.sin_family = AF_INET;
addr.sin_port = data->port;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_addr.s_addr = data->addr;
if (connect(s0, (struct sockaddr*)&addr, sizeof(addr)) != -1)
wait_for_var(&recv_thread_done);
@@ -90,11 +92,12 @@ void *recv_thread(void *arg)
struct sockaddr_in addr;
addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = 0x0100007fU;
+ data->addr = inet_addr("127.0.0.1");
+ addr.sin_addr.s_addr = data->addr;
i = 0;
do {
- data->port = 1025 + (rand() % 64510);
+ data->port = htons(1025 + (rand() % 64510));
addr.sin_port = data->port;
if (bind(s0, (struct sockaddr*)&addr, sizeof(addr)) != -1)
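
The poll-link.c hunks fix byte-order handling: sin_port and sin_addr must be
in network byte order, so the raw 0x0100007f constant and host-order port are
replaced with inet_addr()/htons(). The corrected setup, as a standalone sketch
(init_loopback() is illustrative only):

	#include <string.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	/* Fill in an IPv4 loopback address; both fields end up in
	 * network byte order. */
	static void init_loopback(struct sockaddr_in *addr, unsigned short port)
	{
		memset(addr, 0, sizeof(*addr));
		addr->sin_family = AF_INET;
		addr->sin_port = htons(port);		/* host -> network */
		addr->sin_addr.s_addr = inet_addr("127.0.0.1");
	}
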
diff --git a/test/poll-many.c b/test/poll-many.c
index 3f8d08d..dfbeeab 100644
--- a/test/poll-many.c
+++ b/test/poll-many.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <signal.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/resource.h>
#include <fcntl.h>
diff --git a/test/poll-mshot-update.c b/test/poll-mshot-update.c
index 1a9ea0a..caedb6f 100644
--- a/test/poll-mshot-update.c
+++ b/test/poll-mshot-update.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <signal.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <pthread.h>
@@ -28,7 +28,37 @@ struct p {
};
static struct p p[NFILES];
-static int no_update;
+
+static int has_poll_update(void)
+{
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ bool has_update = false;
+ int ret;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret)
+ return -1;
+
+ sqe = io_uring_get_sqe(&ring);
+	io_uring_prep_poll_update(sqe, 0, 0, POLLIN, IORING_POLL_UPDATE_EVENTS);
+
+ ret = io_uring_submit(&ring);
+ if (ret != 1)
+ return -1;
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (!ret) {
+ if (cqe->res == -ENOENT)
+ has_update = true;
+ else if (cqe->res != -EINVAL)
+ return -1;
+ io_uring_cqe_seen(&ring, cqe);
+ }
+ io_uring_queue_exit(&ring);
+ return has_update;
+}
static int arm_poll(struct io_uring *ring, int off)
{
@@ -40,8 +70,7 @@ static int arm_poll(struct io_uring *ring, int off)
return 1;
}
- io_uring_prep_poll_add(sqe, p[off].fd[0], POLLIN);
- sqe->len = 1;
+ io_uring_prep_poll_multishot(sqe, p[off].fd[0], POLLIN);
sqe->user_data = off;
return 0;
}
@@ -57,8 +86,8 @@ static int reap_polls(struct io_uring *ring)
sqe = io_uring_get_sqe(ring);
/* update event */
- io_uring_prep_poll_update(sqe, (void *)(unsigned long)i, NULL,
- POLLIN, 2);
+ io_uring_prep_poll_update(sqe, i, 0, POLLIN,
+ IORING_POLL_UPDATE_EVENTS);
sqe->user_data = 0x12345678;
}
@@ -77,7 +106,6 @@ static int reap_polls(struct io_uring *ring)
off = cqe->user_data;
if (off == 0x12345678)
goto seen;
- p[off].triggered = 0;
ret = read(p[off].fd[0], &c, 1);
if (ret != 1) {
if (ret == -1 && errno == EAGAIN)
@@ -128,19 +156,6 @@ static void *trigger_polls_fn(void *data)
return NULL;
}
-static int check_no_update(struct io_uring *ring)
-{
- struct io_uring_cqe *cqe;
- int ret;
-
- ret = io_uring_wait_cqe(ring, &cqe);
- if (ret)
- return 0;
- ret = cqe->res;
- io_uring_cqe_seen(ring, cqe);
- return ret == -EINVAL;
-}
-
static int arm_polls(struct io_uring *ring)
{
int ret, to_arm = NFILES, i, off;
@@ -163,10 +178,6 @@ static int arm_polls(struct io_uring *ring)
ret = io_uring_submit(ring);
if (ret != this_arm) {
- if (ret > 0 && check_no_update(ring)) {
- no_update = 1;
- return 0;
- }
fprintf(stderr, "submitted %d, %d\n", ret, this_arm);
return 1;
}
@@ -182,11 +193,20 @@ int main(int argc, char *argv[])
struct io_uring_params params = { };
struct rlimit rlim;
pthread_t thread;
- int i, ret;
+ int i, j, ret;
if (argc > 1)
return 0;
+ ret = has_poll_update();
+ if (ret < 0) {
+ fprintf(stderr, "poll update check failed %i\n", ret);
+ return -1;
+ } else if (!ret) {
+ fprintf(stderr, "no poll update, skip\n");
+ return 0;
+ }
+
if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
perror("getrlimit");
goto err_noring;
@@ -227,10 +247,6 @@ int main(int argc, char *argv[])
if (arm_polls(&ring))
goto err;
- if (no_update) {
- printf("No poll update support, skipping\n");
- goto done;
- }
for (i = 0; i < NLOOPS; i++) {
pthread_create(&thread, NULL, trigger_polls_fn, NULL);
@@ -238,9 +254,11 @@ int main(int argc, char *argv[])
if (ret)
goto err;
pthread_join(thread, NULL);
+
+ for (j = 0; j < NFILES; j++)
+ p[j].triggered = 0;
}
-done:
io_uring_queue_exit(&ring);
return 0;
err:
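
Two idioms in this file are worth noting: has_poll_update() probes support by
updating a nonexistent request, where -ENOENT means the opcode is understood
and -EINVAL means an older kernel; and arm_poll() now uses
io_uring_prep_poll_multishot(), where one SQE keeps producing CQEs until the
kernel clears IORING_CQE_F_MORE. A consumer-side sketch of the latter
convention (consume_poll_cqe() is illustrative only):

	/* Returns nonzero if the multishot poll has terminated and must
	 * be re-armed by the caller. */
	static int consume_poll_cqe(struct io_uring *ring,
				    struct io_uring_cqe *cqe)
	{
		int must_rearm = !(cqe->flags & IORING_CQE_F_MORE);

		io_uring_cqe_seen(ring, cqe);
		return must_rearm;
	}
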
diff --git a/test/poll-ring.c b/test/poll-ring.c
index 1f69e20..2dd3ae6 100644
--- a/test/poll-ring.c
+++ b/test/poll-ring.c
@@ -10,7 +10,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "liburing.h"
diff --git a/test/poll-v-poll.c b/test/poll-v-poll.c
index c8ba6f1..1b277db 100644
--- a/test/poll-v-poll.c
+++ b/test/poll-v-poll.c
@@ -10,7 +10,7 @@
#include <string.h>
#include <signal.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/wait.h>
#include <sys/select.h>
#include <pthread.h>
diff --git a/test/poll.c b/test/poll.c
index f9a89d0..1cd57ba 100644
--- a/test/poll.c
+++ b/test/poll.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <signal.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/wait.h>
#include "liburing.h"
diff --git a/test/pollfree.c b/test/pollfree.c
new file mode 100644
index 0000000..d753ffe
--- /dev/null
+++ b/test/pollfree.c
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: MIT */
+// https://syzkaller.appspot.com/bug?id=5f5a44abb4cba056fe24255c4fcb7e7bbe13de7a
+// autogenerated by syzkaller (https://github.com/google/syzkaller)
+
+#include <dirent.h>
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/futex.h>
+
+#ifdef __NR_futex
+
+static void sleep_ms(uint64_t ms)
+{
+ usleep(ms * 1000);
+}
+
+static uint64_t current_time_ms(void)
+{
+ struct timespec ts;
+ if (clock_gettime(CLOCK_MONOTONIC, &ts))
+ exit(1);
+ return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
+}
+
+static void thread_start(void* (*fn)(void*), void* arg)
+{
+ pthread_t th;
+ pthread_attr_t attr;
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 128 << 10);
+ int i = 0;
+ for (; i < 100; i++) {
+ if (pthread_create(&th, &attr, fn, arg) == 0) {
+ pthread_attr_destroy(&attr);
+ return;
+ }
+ if (errno == EAGAIN) {
+ usleep(50);
+ continue;
+ }
+ break;
+ }
+ exit(1);
+}
+
+typedef struct {
+ int state;
+} event_t;
+
+static void event_init(event_t* ev)
+{
+ ev->state = 0;
+}
+
+static void event_reset(event_t* ev)
+{
+ ev->state = 0;
+}
+
+static void event_set(event_t* ev)
+{
+ if (ev->state)
+ exit(1);
+ __atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
+ syscall(__NR_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
+}
+
+static void event_wait(event_t* ev)
+{
+ while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
+ syscall(__NR_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
+}
+
+static int event_isset(event_t* ev)
+{
+ return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
+}
+
+static int event_timedwait(event_t* ev, uint64_t timeout)
+{
+ uint64_t start = current_time_ms();
+ uint64_t now = start;
+ for (;;) {
+ uint64_t remain = timeout - (now - start);
+ struct timespec ts;
+ ts.tv_sec = remain / 1000;
+ ts.tv_nsec = (remain % 1000) * 1000 * 1000;
+ syscall(__NR_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
+ if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
+ return 1;
+ now = current_time_ms();
+ if (now - start > timeout)
+ return 0;
+ }
+}
+
+#define SIZEOF_IO_URING_SQE 64
+#define SIZEOF_IO_URING_CQE 16
+#define SQ_HEAD_OFFSET 0
+#define SQ_TAIL_OFFSET 64
+#define SQ_RING_MASK_OFFSET 256
+#define SQ_RING_ENTRIES_OFFSET 264
+#define SQ_FLAGS_OFFSET 276
+#define SQ_DROPPED_OFFSET 272
+#define CQ_HEAD_OFFSET 128
+#define CQ_TAIL_OFFSET 192
+#define CQ_RING_MASK_OFFSET 260
+#define CQ_RING_ENTRIES_OFFSET 268
+#define CQ_RING_OVERFLOW_OFFSET 284
+#define CQ_FLAGS_OFFSET 280
+#define CQ_CQES_OFFSET 320
+
+struct io_sqring_offsets {
+ uint32_t head;
+ uint32_t tail;
+ uint32_t ring_mask;
+ uint32_t ring_entries;
+ uint32_t flags;
+ uint32_t dropped;
+ uint32_t array;
+ uint32_t resv1;
+ uint64_t resv2;
+};
+
+struct io_cqring_offsets {
+ uint32_t head;
+ uint32_t tail;
+ uint32_t ring_mask;
+ uint32_t ring_entries;
+ uint32_t overflow;
+ uint32_t cqes;
+ uint64_t resv[2];
+};
+
+struct io_uring_params {
+ uint32_t sq_entries;
+ uint32_t cq_entries;
+ uint32_t flags;
+ uint32_t sq_thread_cpu;
+ uint32_t sq_thread_idle;
+ uint32_t features;
+ uint32_t resv[4];
+ struct io_sqring_offsets sq_off;
+ struct io_cqring_offsets cq_off;
+};
+
+#define IORING_OFF_SQ_RING 0
+#define IORING_OFF_SQES 0x10000000ULL
+
+#define sys_io_uring_setup 425
+static long syz_io_uring_setup(volatile long a0, volatile long a1,
+ volatile long a2, volatile long a3,
+ volatile long a4, volatile long a5)
+{
+ uint32_t entries = (uint32_t)a0;
+ struct io_uring_params* setup_params = (struct io_uring_params*)a1;
+ void* vma1 = (void*)a2;
+ void* vma2 = (void*)a3;
+ void** ring_ptr_out = (void**)a4;
+ void** sqes_ptr_out = (void**)a5;
+ uint32_t fd_io_uring = syscall(sys_io_uring_setup, entries, setup_params);
+ uint32_t sq_ring_sz =
+ setup_params->sq_off.array + setup_params->sq_entries * sizeof(uint32_t);
+ uint32_t cq_ring_sz = setup_params->cq_off.cqes +
+ setup_params->cq_entries * SIZEOF_IO_URING_CQE;
+ uint32_t ring_sz = sq_ring_sz > cq_ring_sz ? sq_ring_sz : cq_ring_sz;
+ *ring_ptr_out = mmap(vma1, ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring,
+ IORING_OFF_SQ_RING);
+ uint32_t sqes_sz = setup_params->sq_entries * SIZEOF_IO_URING_SQE;
+ *sqes_ptr_out =
+ mmap(vma2, sqes_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring, IORING_OFF_SQES);
+ return fd_io_uring;
+}
+
+static long syz_io_uring_submit(volatile long a0, volatile long a1,
+ volatile long a2, volatile long a3)
+{
+ char* ring_ptr = (char*)a0;
+ char* sqes_ptr = (char*)a1;
+ char* sqe = (char*)a2;
+ uint32_t sqes_index = (uint32_t)a3;
+ uint32_t sq_ring_entries = *(uint32_t*)(ring_ptr + SQ_RING_ENTRIES_OFFSET);
+ uint32_t cq_ring_entries = *(uint32_t*)(ring_ptr + CQ_RING_ENTRIES_OFFSET);
+ uint32_t sq_array_off =
+ (CQ_CQES_OFFSET + cq_ring_entries * SIZEOF_IO_URING_CQE + 63) & ~63;
+ if (sq_ring_entries)
+ sqes_index %= sq_ring_entries;
+ char* sqe_dest = sqes_ptr + sqes_index * SIZEOF_IO_URING_SQE;
+ memcpy(sqe_dest, sqe, SIZEOF_IO_URING_SQE);
+ uint32_t sq_ring_mask = *(uint32_t*)(ring_ptr + SQ_RING_MASK_OFFSET);
+ uint32_t* sq_tail_ptr = (uint32_t*)(ring_ptr + SQ_TAIL_OFFSET);
+ uint32_t sq_tail = *sq_tail_ptr & sq_ring_mask;
+ uint32_t sq_tail_next = *sq_tail_ptr + 1;
+ uint32_t* sq_array = (uint32_t*)(ring_ptr + sq_array_off);
+ *(sq_array + sq_tail) = sqes_index;
+ __atomic_store_n(sq_tail_ptr, sq_tail_next, __ATOMIC_RELEASE);
+ return 0;
+}
+
+static void kill_and_wait(int pid, int* status)
+{
+ kill(-pid, SIGKILL);
+ kill(pid, SIGKILL);
+ for (int i = 0; i < 100; i++) {
+ if (waitpid(-1, status, WNOHANG | __WALL) == pid)
+ return;
+ usleep(1000);
+ }
+ DIR* dir = opendir("/sys/fs/fuse/connections");
+ if (dir) {
+ for (;;) {
+ struct dirent* ent = readdir(dir);
+ if (!ent)
+ break;
+ if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
+ continue;
+ char abort[300];
+ snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
+ ent->d_name);
+ int fd = open(abort, O_WRONLY);
+ if (fd == -1) {
+ continue;
+ }
+ if (write(fd, abort, 1) < 0) {
+ }
+ close(fd);
+ }
+ closedir(dir);
+ } else {
+ }
+ while (waitpid(-1, status, __WALL) != pid) {
+ }
+}
+
+static void setup_test()
+{
+ prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
+ setpgrp();
+}
+
+struct thread_t {
+ int created, call;
+ event_t ready, done;
+};
+
+static struct thread_t threads[16];
+static void execute_call(int call);
+static int running;
+
+static void* thr(void* arg)
+{
+ struct thread_t* th = (struct thread_t*)arg;
+ for (;;) {
+ event_wait(&th->ready);
+ event_reset(&th->ready);
+ execute_call(th->call);
+ __atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
+ event_set(&th->done);
+ }
+ return 0;
+}
+
+static void execute_one(void)
+{
+ int i, call, thread;
+ for (call = 0; call < 4; call++) {
+ for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
+ thread++) {
+ struct thread_t* th = &threads[thread];
+ if (!th->created) {
+ th->created = 1;
+ event_init(&th->ready);
+ event_init(&th->done);
+ event_set(&th->done);
+ thread_start(thr, th);
+ }
+ if (!event_isset(&th->done))
+ continue;
+ event_reset(&th->done);
+ th->call = call;
+ __atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
+ event_set(&th->ready);
+ event_timedwait(&th->done, 50);
+ break;
+ }
+ }
+ for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
+ sleep_ms(1);
+}
+
+static void execute_one(void);
+
+#define WAIT_FLAGS __WALL
+
+static void loop(void)
+{
+ int iter = 0;
+ for (; iter < 5000; iter++) {
+ int pid = fork();
+ if (pid < 0)
+ exit(1);
+ if (pid == 0) {
+ setup_test();
+ execute_one();
+ exit(0);
+ }
+ int status = 0;
+ uint64_t start = current_time_ms();
+ for (;;) {
+ if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
+ break;
+ sleep_ms(1);
+ if (current_time_ms() - start < 5000)
+ continue;
+ kill_and_wait(pid, &status);
+ break;
+ }
+ }
+}
+
+#ifndef __NR_io_uring_enter
+#define __NR_io_uring_enter 426
+#endif
+
+uint64_t r[4] = {0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0};
+
+void execute_call(int call)
+{
+ intptr_t res = 0;
+ switch (call) {
+ case 0:
+ *(uint64_t*)0x200000c0 = 0;
+ res = syscall(__NR_signalfd4, -1, 0x200000c0ul, 8ul, 0ul);
+ if (res != -1)
+ r[0] = res;
+ break;
+ case 1:
+ *(uint32_t*)0x20000a84 = 0;
+ *(uint32_t*)0x20000a88 = 0;
+ *(uint32_t*)0x20000a8c = 0;
+ *(uint32_t*)0x20000a90 = 0;
+ *(uint32_t*)0x20000a98 = -1;
+ memset((void*)0x20000a9c, 0, 12);
+ res = -1;
+ res = syz_io_uring_setup(0x87, 0x20000a80, 0x206d6000, 0x206d7000,
+ 0x20000000, 0x20000040);
+ if (res != -1) {
+ r[1] = res;
+ r[2] = *(uint64_t*)0x20000000;
+ r[3] = *(uint64_t*)0x20000040;
+ }
+ break;
+ case 2:
+ *(uint8_t*)0x20002240 = 6;
+ *(uint8_t*)0x20002241 = 0;
+ *(uint16_t*)0x20002242 = 0;
+ *(uint32_t*)0x20002244 = r[0];
+ *(uint64_t*)0x20002248 = 0;
+ *(uint64_t*)0x20002250 = 0;
+ *(uint32_t*)0x20002258 = 0;
+ *(uint16_t*)0x2000225c = 0;
+ *(uint16_t*)0x2000225e = 0;
+ *(uint64_t*)0x20002260 = 0;
+ *(uint16_t*)0x20002268 = 0;
+ *(uint16_t*)0x2000226a = 0;
+ memset((void*)0x2000226c, 0, 20);
+ syz_io_uring_submit(r[2], r[3], 0x20002240, 0);
+ break;
+ case 3:
+ syscall(__NR_io_uring_enter, r[1], 0x1523a, 0, 0ul, 0ul, 0xaul);
+ break;
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ void *ret;
+
+#if !defined(__i386) && !defined(__x86_64__)
+ return 0;
+#endif
+
+ if (argc > 1)
+ return 0;
+
+ ret = mmap((void *)0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
+ if (ret == MAP_FAILED)
+ return 0;
+ ret = mmap((void *)0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
+ if (ret == MAP_FAILED)
+ return 0;
+ ret = mmap((void *)0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
+ if (ret == MAP_FAILED)
+ return 0;
+ loop();
+ return 0;
+}
+
+#else /* __NR_futex */
+
+int main(int argc, char *argv[])
+{
+ return 0;
+}
+
+#endif /* __NR_futex */
diff --git a/test/probe.c b/test/probe.c
index c7fc053..fd59612 100644
--- a/test/probe.c
+++ b/test/probe.c
@@ -36,7 +36,7 @@ static int verify_probe(struct io_uring_probe *p, int full)
return 1;
}
if (!(p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED)) {
- fprintf(stderr, "READV not supported!?\n");
+ fprintf(stderr, "WRITE not supported!?\n");
return 1;
}
@@ -45,6 +45,7 @@ static int verify_probe(struct io_uring_probe *p, int full)
static int test_probe_helper(struct io_uring *ring)
{
+ int ret;
struct io_uring_probe *p;
p = io_uring_get_probe_ring(ring);
@@ -53,12 +54,9 @@ static int test_probe_helper(struct io_uring *ring)
return 1;
}
- if (verify_probe(p, 1)) {
- free(p);
- return 1;
- }
-
- return 0;
+ ret = verify_probe(p, 1);
+ io_uring_free_probe(p);
+ return ret;
}
static int test_probe(struct io_uring *ring)
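
The probe.c change also pins down the intended lifecycle: probes allocated by
io_uring_get_probe_ring()/io_uring_get_probe() are released with
io_uring_free_probe(), not plain free(). A sketch of the common
check-one-opcode pattern (supports_op() is illustrative only):

	/* Returns nonzero if the ring's kernel supports the opcode. */
	static int supports_op(struct io_uring *ring, int op)
	{
		struct io_uring_probe *p;
		int ret;

		p = io_uring_get_probe_ring(ring);
		if (!p)
			return 0;
		ret = io_uring_opcode_supported(p, op);
		io_uring_free_probe(p);
		return ret;
	}
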
diff --git a/test/read-before-exit.c b/test/read-before-exit.c
new file mode 100644
index 0000000..be36bd4
--- /dev/null
+++ b/test/read-before-exit.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test that IO issued from a thread which exits
+ * immediately after submitting still completes correctly.
+ *
+ * Original test case from: https://github.com/axboe/liburing/issues/582
+ */
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/timerfd.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+struct data {
+ struct io_uring *ring;
+ int timer_fd1;
+ int timer_fd2;
+ uint64_t buf1;
+ uint64_t buf2;
+};
+
+void *submit(void *data)
+{
+ struct io_uring_sqe *sqe;
+ struct data *d = data;
+ int ret;
+
+ sqe = io_uring_get_sqe(d->ring);
+ io_uring_prep_read(sqe, d->timer_fd1, &d->buf1, sizeof(d->buf1), 0);
+
+ sqe = io_uring_get_sqe(d->ring);
+ io_uring_prep_read(sqe, d->timer_fd2, &d->buf2, sizeof(d->buf2), 0);
+
+ ret = io_uring_submit(d->ring);
+ if (ret != 2)
+ return (void *) (uintptr_t) 1;
+
+ /* Exit suddenly. */
+ return NULL;
+}
+
+static int test(int flags)
+{
+ struct io_uring_params params = { .flags = flags, };
+ struct io_uring ring;
+ struct data d = { .ring = &ring, };
+ pthread_t thread;
+ void *res;
+ int ret;
+
+ ret = t_create_ring_params(8, &ring, &params);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret != T_SETUP_OK)
+ return 1;
+
+ d.timer_fd1 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+ if (d.timer_fd1 < 0) {
+ perror("timerfd_create");
+ return 1;
+ }
+ d.timer_fd2 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+ if (d.timer_fd2 < 0) {
+ perror("timerfd_create");
+ return 1;
+ }
+
+ pthread_create(&thread, NULL, submit, &d);
+ pthread_join(thread, &res);
+
+	/* Wait for completions and do stuff ... */
+
+ io_uring_queue_exit(&ring);
+
+ close(d.timer_fd1);
+ close(d.timer_fd2);
+ return !!res;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret, i;
+
+ for (i = 0; i < 1000; i++) {
+ ret = test(0);
+ if (ret) {
+ fprintf(stderr, "Test failed\n");
+ return ret;
+ }
+ }
+
+ for (i = 0; i < 1000; i++) {
+ ret = test(IORING_SETUP_IOPOLL);
+ if (ret) {
+ fprintf(stderr, "Test IOPOLL failed\n");
+ return ret;
+ }
+ }
+
+ for (i = 0; i < 100; i++) {
+ ret = test(IORING_SETUP_SQPOLL);
+ if (ret) {
+ fprintf(stderr, "Test SQPOLL failed\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
diff --git a/test/read-write.c b/test/read-write.c
index d0a77fa..3951a64 100644
--- a/test/read-write.c
+++ b/test/read-write.c
@@ -9,15 +9,15 @@
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include "helpers.h"
#include "liburing.h"
-#define FILE_SIZE (128 * 1024)
-#define BS 4096
+#define FILE_SIZE (256 * 1024)
+#define BS 8192
#define BUFFERS (FILE_SIZE / BS)
static struct iovec *vecs;
@@ -49,7 +49,7 @@ static int __test_io(const char *file, struct io_uring *ring, int write,
struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
int open_flags;
- int i, fd, ret;
+ int i, fd = -1, ret;
off_t offset;
#ifdef VERBOSE
@@ -57,13 +57,6 @@ static int __test_io(const char *file, struct io_uring *ring, int write,
buffered, sqthread,
fixed, nonvec);
#endif
- if (sqthread && geteuid()) {
-#ifdef VERBOSE
- fprintf(stdout, "SKIPPED (not root)\n");
-#endif
- return 0;
- }
-
if (write)
open_flags = O_WRONLY;
else
@@ -71,19 +64,22 @@ static int __test_io(const char *file, struct io_uring *ring, int write,
if (!buffered)
open_flags |= O_DIRECT;
+ if (fixed) {
+ ret = t_register_buffers(ring, vecs, BUFFERS);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "buffer reg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
fd = open(file, open_flags);
if (fd < 0) {
perror("file open");
goto err;
}
- if (fixed) {
- ret = io_uring_register_buffers(ring, vecs, BUFFERS);
- if (ret) {
- fprintf(stderr, "buffer reg failed: %d\n", ret);
- goto err;
- }
- }
if (sqthread) {
ret = io_uring_register_files(ring, &fd, 1);
if (ret) {
@@ -235,30 +231,21 @@ static int test_io(const char *file, int write, int buffered, int sqthread,
int fixed, int nonvec, int exp_len)
{
struct io_uring ring;
- int ret, ring_flags;
+ int ret, ring_flags = 0;
- if (sqthread) {
- if (geteuid()) {
- if (!warned) {
- fprintf(stderr, "SQPOLL requires root, skipping\n");
- warned = 1;
- }
- return 0;
- }
+ if (sqthread)
ring_flags = IORING_SETUP_SQPOLL;
- } else {
- ring_flags = 0;
- }
- ret = io_uring_queue_init(64, &ring, ring_flags);
- if (ret) {
+ ret = t_create_ring(64, &ring, ring_flags);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
fprintf(stderr, "ring create failed: %d\n", ret);
return 1;
}
ret = __test_io(file, &ring, write, buffered, sqthread, fixed, nonvec,
0, 0, exp_len);
-
io_uring_queue_exit(&ring);
return ret;
}
@@ -493,7 +480,7 @@ static int test_buf_select(const char *filename, int nonvec)
fprintf(stdout, "Buffer select not supported, skipping\n");
return 0;
}
- free(p);
+ io_uring_free_probe(p);
/*
* Write out data with known pattern
@@ -671,8 +658,8 @@ static int test_write_efbig(void)
return 1;
}
rlim = old_rlim;
- rlim.rlim_cur = 64 * 1024;
- rlim.rlim_max = 64 * 1024;
+ rlim.rlim_cur = 128 * 1024;
+ rlim.rlim_max = 128 * 1024;
if (setrlimit(RLIMIT_FSIZE, &rlim) < 0) {
perror("setrlimit");
return 1;
@@ -683,6 +670,7 @@ static int test_write_efbig(void)
perror("file open");
goto err;
}
+ unlink(".efbig");
ret = io_uring_queue_init(32, &ring, 0);
if (ret) {
@@ -739,19 +727,22 @@ static int test_write_efbig(void)
err:
if (fd != -1)
close(fd);
- unlink(".efbig");
return 1;
}
int main(int argc, char *argv[])
{
int i, ret, nr;
+ char buf[256];
char *fname;
if (argc > 1) {
fname = argv[1];
} else {
- fname = ".basic-rw";
+ srand((unsigned)time(NULL));
+ snprintf(buf, sizeof(buf), ".basic-rw-%u-%u",
+ (unsigned)rand(), (unsigned)getpid());
+ fname = buf;
t_create_file(fname, FILE_SIZE);
}
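
The fixed-buffer path now goes through the suite's t_register_buffers()
helper so an expected registration failure (e.g. locked-memory limits) turns
into a skip rather than a test failure. Underneath, the flow is still
register-once, then address buffers by index; a sketch of a single fixed read
(read_fixed_once() is illustrative, error handling trimmed):

	static int read_fixed_once(struct io_uring *ring, int fd,
				   struct iovec *vecs, unsigned nr)
	{
		struct io_uring_sqe *sqe;
		int ret;

		ret = io_uring_register_buffers(ring, vecs, nr);
		if (ret)
			return ret;	/* caller may fall back to plain reads */

		sqe = io_uring_get_sqe(ring);
		/* last argument selects registered buffer index 0 */
		io_uring_prep_read_fixed(sqe, fd, vecs[0].iov_base,
					 vecs[0].iov_len, 0, 0);
		return io_uring_submit(ring) == 1 ? 0 : 1;
	}
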
diff --git a/test/recv-msgall-stream.c b/test/recv-msgall-stream.c
new file mode 100644
index 0000000..a188cc1
--- /dev/null
+++ b/test/recv-msgall-stream.c
@@ -0,0 +1,400 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test MSG_WAITALL for recv/recvmsg and include normal sync versions just
+ * for comparison.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <pthread.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define MAX_MSG 128
+
+static int port = 31200;
+
+struct recv_data {
+ pthread_mutex_t mutex;
+ int use_recvmsg;
+ int use_sync;
+ int port;
+};
+
+static int get_conn_sock(struct recv_data *rd, int *sockout)
+{
+ struct sockaddr_in saddr;
+ int sockfd, ret, val;
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_addr.s_addr = htonl(INADDR_ANY);
+ saddr.sin_port = htons(rd->port);
+
+ sockfd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (sockfd < 0) {
+ perror("socket");
+ goto err;
+ }
+
+ val = 1;
+ setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+ setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val));
+
+ ret = bind(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("bind");
+ goto err;
+ }
+
+ ret = listen(sockfd, 16);
+ if (ret < 0) {
+ perror("listen");
+ goto err;
+ }
+
+ pthread_mutex_unlock(&rd->mutex);
+
+ ret = accept(sockfd, NULL, NULL);
+ if (ret < 0) {
+ perror("accept");
+ return -1;
+ }
+
+ *sockout = sockfd;
+ return ret;
+err:
+ pthread_mutex_unlock(&rd->mutex);
+ return -1;
+}
+
+static int recv_prep(struct io_uring *ring, struct iovec *iov, int *sock,
+ struct recv_data *rd)
+{
+ struct io_uring_sqe *sqe;
+ struct msghdr msg = { };
+ int sockfd, sockout = -1, ret;
+
+ sockfd = get_conn_sock(rd, &sockout);
+ if (sockfd < 0)
+ goto err;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!rd->use_recvmsg) {
+ io_uring_prep_recv(sqe, sockfd, iov->iov_base, iov->iov_len,
+ MSG_WAITALL);
+ } else {
+ msg.msg_namelen = sizeof(struct sockaddr_in);
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ io_uring_prep_recvmsg(sqe, sockfd, &msg, MSG_WAITALL);
+ }
+
+ sqe->user_data = 2;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ *sock = sockfd;
+ return 0;
+err:
+ if (sockout != -1) {
+ shutdown(sockout, SHUT_RDWR);
+ close(sockout);
+ }
+ if (sockfd != -1) {
+ shutdown(sockfd, SHUT_RDWR);
+ close(sockfd);
+ }
+ return 1;
+}
+
+static int do_recv(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stdout, "wait_cqe: %d\n", ret);
+ goto err;
+ }
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "recv not supported, skipping\n");
+ return 0;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+ if (cqe->res != MAX_MSG * sizeof(int)) {
+ fprintf(stderr, "got wrong length: %d\n", cqe->res);
+ goto err;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+err:
+ return 1;
+}
+
+static int recv_sync(struct recv_data *rd)
+{
+ int buf[MAX_MSG];
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf),
+ };
+ int i, ret, sockfd, sockout = -1;
+
+ sockfd = get_conn_sock(rd, &sockout);
+
+ if (rd->use_recvmsg) {
+ struct msghdr msg = { };
+
+ msg.msg_namelen = sizeof(struct sockaddr_in);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = recvmsg(sockfd, &msg, MSG_WAITALL);
+ } else {
+ ret = recv(sockfd, buf, sizeof(buf), MSG_WAITALL);
+ }
+
+ if (ret < 0) {
+ perror("receive");
+ goto err;
+ }
+
+ if (ret != sizeof(buf)) {
+ ret = -1;
+ goto err;
+ }
+
+ for (i = 0; i < MAX_MSG; i++) {
+ if (buf[i] != i)
+ goto err;
+ }
+ ret = 0;
+err:
+ shutdown(sockout, SHUT_RDWR);
+ shutdown(sockfd, SHUT_RDWR);
+ close(sockout);
+ close(sockfd);
+ return ret;
+}
+
+static int recv_uring(struct recv_data *rd)
+{
+ int buf[MAX_MSG];
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf),
+ };
+ struct io_uring_params p = { };
+ struct io_uring ring;
+ int ret, sock = -1, sockout = -1;
+
+ ret = t_create_ring_params(1, &ring, &p);
+ if (ret == T_SETUP_SKIP) {
+ pthread_mutex_unlock(&rd->mutex);
+ ret = 0;
+ goto err;
+ } else if (ret < 0) {
+ pthread_mutex_unlock(&rd->mutex);
+ goto err;
+ }
+
+	ret = recv_prep(&ring, &iov, &sock, rd);
+	if (ret) {
+ fprintf(stderr, "recv_prep failed: %d\n", ret);
+ goto err;
+ }
+ ret = do_recv(&ring);
+ if (!ret) {
+ int i;
+
+ for (i = 0; i < MAX_MSG; i++) {
+ if (buf[i] != i) {
+ fprintf(stderr, "found %d at %d\n", buf[i], i);
+ ret = 1;
+ break;
+ }
+ }
+ }
+
+ shutdown(sockout, SHUT_RDWR);
+ shutdown(sock, SHUT_RDWR);
+ close(sock);
+ close(sockout);
+ io_uring_queue_exit(&ring);
+err:
+ if (sock != -1) {
+ shutdown(sock, SHUT_RDWR);
+ close(sock);
+ }
+ if (sockout != -1) {
+ shutdown(sockout, SHUT_RDWR);
+ close(sockout);
+ }
+ return ret;
+}
+
+static void *recv_fn(void *data)
+{
+ struct recv_data *rd = data;
+
+ if (rd->use_sync)
+ return (void *) (uintptr_t) recv_sync(rd);
+
+ return (void *) (uintptr_t) recv_uring(rd);
+}
+
+static int do_send(struct recv_data *rd)
+{
+ struct sockaddr_in saddr;
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, i;
+ struct iovec iov;
+ int *buf;
+
+ ret = io_uring_queue_init(2, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "queue init failed: %d\n", ret);
+ return 1;
+ }
+
+ buf = malloc(MAX_MSG * sizeof(int));
+ for (i = 0; i < MAX_MSG; i++)
+ buf[i] = i;
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_port = htons(rd->port);
+ inet_pton(AF_INET, "127.0.0.1", &saddr.sin_addr);
+
+ sockfd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ pthread_mutex_lock(&rd->mutex);
+
+ ret = connect(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("connect");
+ return 1;
+ }
+
+ iov.iov_base = buf;
+ iov.iov_len = MAX_MSG * sizeof(int) / 2;
+ for (i = 0; i < 2; i++) {
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_send(sqe, sockfd, iov.iov_base, iov.iov_len, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(&ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+ usleep(10000);
+ iov.iov_base += iov.iov_len;
+ }
+
+ for (i = 0; i < 2; i++) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "send not supported, skipping\n");
+ close(sockfd);
+ return 0;
+ }
+ if (cqe->res != iov.iov_len) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ }
+
+ shutdown(sockfd, SHUT_RDWR);
+ close(sockfd);
+ return 0;
+err:
+ shutdown(sockfd, SHUT_RDWR);
+ close(sockfd);
+ return 1;
+}
+
+static int test(int use_recvmsg, int use_sync)
+{
+ pthread_mutexattr_t attr;
+ pthread_t recv_thread;
+ struct recv_data rd;
+ int ret;
+ void *retval;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_setpshared(&attr, 1);
+ pthread_mutex_init(&rd.mutex, &attr);
+ pthread_mutex_lock(&rd.mutex);
+ rd.use_recvmsg = use_recvmsg;
+ rd.use_sync = use_sync;
+ rd.port = port++;
+
+ ret = pthread_create(&recv_thread, NULL, recv_fn, &rd);
+ if (ret) {
+ fprintf(stderr, "Thread create failed: %d\n", ret);
+ pthread_mutex_unlock(&rd.mutex);
+ return 1;
+ }
+
+ do_send(&rd);
+ pthread_join(recv_thread, &retval);
+ return (intptr_t)retval;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = test(0, 0);
+ if (ret) {
+ fprintf(stderr, "test recv failed\n");
+ return ret;
+ }
+
+ ret = test(1, 0);
+ if (ret) {
+ fprintf(stderr, "test recvmsg failed\n");
+ return ret;
+ }
+
+ ret = test(0, 1);
+ if (ret) {
+ fprintf(stderr, "test sync recv failed\n");
+ return ret;
+ }
+
+ ret = test(1, 1);
+ if (ret) {
+ fprintf(stderr, "test sync recvmsg failed\n");
+ return ret;
+ }
+
+ return 0;
+}
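
The point of this test: do_send() deliberately splits the payload in half with
a 10ms pause between the two sends, so a receiver without MSG_WAITALL would
typically complete short. With the flag set, the single armed recv should only
complete once the whole buffer has arrived. The receive side reduces to this
sketch (arm_waitall_recv() is illustrative only):

	/* Arm one recv that must not complete until len bytes arrive. */
	static void arm_waitall_recv(struct io_uring *ring, int sockfd,
				     void *buf, size_t len)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		io_uring_prep_recv(sqe, sockfd, buf, len, MSG_WAITALL);
		sqe->user_data = 2;
	}
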
diff --git a/test/recv-msgall.c b/test/recv-msgall.c
new file mode 100644
index 0000000..a6f7cfc
--- /dev/null
+++ b/test/recv-msgall.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test MSG_WAITALL with datagram sockets, with the send split in two.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <pthread.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define MAX_MSG 128
+
+#define PORT 10201
+#define HOST "127.0.0.1"
+
+static int recv_prep(struct io_uring *ring, struct iovec *iov, int *sock,
+ int use_recvmsg)
+{
+ struct sockaddr_in saddr;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, val;
+ struct msghdr msg = { };
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_addr.s_addr = htonl(INADDR_ANY);
+ saddr.sin_port = htons(PORT);
+
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ val = 1;
+ setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+
+ ret = bind(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("bind");
+ goto err;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ if (!use_recvmsg) {
+ io_uring_prep_recv(sqe, sockfd, iov->iov_base, iov->iov_len,
+ MSG_WAITALL);
+ } else {
+ msg.msg_namelen = sizeof(struct sockaddr_in);
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ io_uring_prep_recvmsg(sqe, sockfd, &msg, MSG_WAITALL);
+ }
+
+ sqe->user_data = 2;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ *sock = sockfd;
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static int do_recv(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stdout, "wait_cqe: %d\n", ret);
+ goto err;
+ }
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "recv not supported, skipping\n");
+ return 0;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+ if (cqe->res != MAX_MSG * sizeof(int) / 2) {
+ fprintf(stderr, "got wrong length: %d\n", cqe->res);
+ goto err;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return 0;
+err:
+ return 1;
+}
+
+struct recv_data {
+ pthread_mutex_t mutex;
+ int use_recvmsg;
+};
+
+static void *recv_fn(void *data)
+{
+ struct recv_data *rd = data;
+ int buf[MAX_MSG];
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf),
+ };
+ struct io_uring_params p = { };
+ struct io_uring ring;
+ int ret, sock;
+
+ ret = t_create_ring_params(1, &ring, &p);
+ if (ret == T_SETUP_SKIP) {
+ pthread_mutex_unlock(&rd->mutex);
+ ret = 0;
+ goto err;
+ } else if (ret < 0) {
+ pthread_mutex_unlock(&rd->mutex);
+ goto err;
+ }
+
+ ret = recv_prep(&ring, &iov, &sock, rd->use_recvmsg);
+ if (ret) {
+ fprintf(stderr, "recv_prep failed: %d\n", ret);
+ goto err;
+ }
+ pthread_mutex_unlock(&rd->mutex);
+ ret = do_recv(&ring);
+ close(sock);
+ io_uring_queue_exit(&ring);
+err:
+ return (void *)(intptr_t)ret;
+}
+
+static int do_send(void)
+{
+ struct sockaddr_in saddr;
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, i;
+ struct iovec iov;
+ int *buf;
+
+ ret = io_uring_queue_init(2, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "queue init failed: %d\n", ret);
+ return 1;
+ }
+
+ buf = malloc(MAX_MSG * sizeof(int));
+ for (i = 0; i < MAX_MSG; i++)
+ buf[i] = i;
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_port = htons(PORT);
+ inet_pton(AF_INET, HOST, &saddr.sin_addr);
+
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ ret = connect(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("connect");
+ return 1;
+ }
+
+ iov.iov_base = buf;
+ iov.iov_len = MAX_MSG * sizeof(int) / 2;
+ for (i = 0; i < 2; i++) {
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_send(sqe, sockfd, iov.iov_base, iov.iov_len, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(&ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+ usleep(10000);
+ iov.iov_base += iov.iov_len;
+ }
+
+ for (i = 0; i < 2; i++) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "send not supported, skipping\n");
+ close(sockfd);
+ return 0;
+ }
+ if (cqe->res != iov.iov_len) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ }
+
+ close(sockfd);
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static int test(int use_recvmsg)
+{
+ pthread_mutexattr_t attr;
+ pthread_t recv_thread;
+ struct recv_data rd;
+ int ret;
+ void *retval;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_setpshared(&attr, 1);
+ pthread_mutex_init(&rd.mutex, &attr);
+ pthread_mutex_lock(&rd.mutex);
+ rd.use_recvmsg = use_recvmsg;
+
+ ret = pthread_create(&recv_thread, NULL, recv_fn, &rd);
+ if (ret) {
+ fprintf(stderr, "Thread create failed: %d\n", ret);
+ pthread_mutex_unlock(&rd.mutex);
+ return 1;
+ }
+
+ pthread_mutex_lock(&rd.mutex);
+ do_send();
+ pthread_join(recv_thread, &retval);
+ return (intptr_t)retval;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = test(0);
+ if (ret) {
+ fprintf(stderr, "test recv failed\n");
+ return ret;
+ }
+
+ ret = test(1);
+ if (ret) {
+ fprintf(stderr, "test recvmsg failed\n");
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/test/register-restrictions.c b/test/register-restrictions.c
index bcae67c..e1cf5bd 100644
--- a/test/register-restrictions.c
+++ b/test/register-restrictions.c
@@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include "liburing.h"
diff --git a/test/rename.c b/test/rename.c
index af09d65..67d4e9c 100644
--- a/test/rename.c
+++ b/test/rename.c
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include <sys/stat.h>
#include "liburing.h"
@@ -25,11 +26,8 @@ static int test_rename(struct io_uring *ring, const char *old, const char *new)
}
memset(sqe, 0, sizeof(*sqe));
- sqe->opcode = IORING_OP_RENAMEAT;
- sqe->fd = AT_FDCWD;
- sqe->addr2 = (unsigned long) new;
- sqe->addr = (unsigned long) old;
- sqe->len = AT_FDCWD;
+
+ io_uring_prep_rename(sqe, old, new);
ret = io_uring_submit(ring);
if (ret <= 0) {
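
The io_uring_prep_rename() helper used above is the AT_FDCWD shorthand; it is
equivalent to the spelled-out renameat form:

	/* long form of io_uring_prep_rename(sqe, old, new) */
	io_uring_prep_renameat(sqe, AT_FDCWD, old, AT_FDCWD, new, 0);
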
diff --git a/test/ring-leak.c b/test/ring-leak.c
index f8f043c..5b739ad 100644
--- a/test/ring-leak.c
+++ b/test/ring-leak.c
@@ -131,12 +131,84 @@ static int test_iowq_request_cancel(void)
ret = read(fds[0], buffer, 10);
if (ret < 0)
perror("read");
+ close(fds[0]);
+ return 0;
+}
+
+static int test_scm_cycles(bool update)
+{
+ char buffer[128];
+ struct io_uring ring;
+ int i, ret;
+ int sp[2], fds[2], reg_fds[4];
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sp) != 0) {
+		perror("Failed to create Unix-domain socket pair");
+ return 1;
+ }
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to init io_uring: %s\n", strerror(-ret));
+ return ret;
+ }
+ if (pipe(fds)) {
+ perror("pipe");
+ return -1;
+ }
+ send_fd(sp[0], ring.ring_fd);
+
+ /* register an empty set for updates */
+ if (update) {
+ for (i = 0; i < 4; i++)
+ reg_fds[i] = -1;
+ ret = io_uring_register_files(&ring, reg_fds, 4);
+ if (ret) {
+ fprintf(stderr, "file_register: %d\n", ret);
+ return ret;
+ }
+ }
+
+ reg_fds[0] = fds[0];
+ reg_fds[1] = fds[1];
+ reg_fds[2] = sp[0];
+ reg_fds[3] = sp[1];
+ if (update) {
+ ret = io_uring_register_files_update(&ring, 0, reg_fds, 4);
+ if (ret != 4) {
+ fprintf(stderr, "file_register: %d\n", ret);
+ return ret;
+ }
+ } else {
+ ret = io_uring_register_files(&ring, reg_fds, 4);
+ if (ret) {
+ fprintf(stderr, "file_register: %d\n", ret);
+ return ret;
+ }
+ }
+
+ close(fds[1]);
+ close(sp[0]);
+ close(sp[1]);
+
+ /* should unregister files and close the write fd */
+ io_uring_queue_exit(&ring);
+
+ /*
+	 * Wait for the ring to "really" exit, which happens
+	 * asynchronously. Rely on the registered write end being closed
+	 * only after ring quiesce, so the blocking read on the other
+	 * pipe end returns once exit has completed.
+ */
+ ret = read(fds[0], buffer, 10);
+ if (ret < 0)
+ perror("read");
+ close(fds[0]);
return 0;
}
int main(int argc, char *argv[])
{
int sp[2], pid, ring_fd, ret;
+ int i;
if (argc > 1)
return 0;
@@ -147,6 +219,18 @@ int main(int argc, char *argv[])
return 1;
}
+ for (i = 0; i < 2; i++) {
+ bool update = !!(i & 1);
+
+ ret = test_scm_cycles(update);
+ if (ret) {
+ fprintf(stderr, "test_scm_cycles() failed %i\n",
+ update);
+ return 1;
+ }
+ }
+
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sp) != 0) {
perror("Failed to create Unix-domain socket pair\n");
return 1;
diff --git a/test/ring-leak2.c b/test/ring-leak2.c
index d9bfe0f..a8c03fe 100644
--- a/test/ring-leak2.c
+++ b/test/ring-leak2.c
@@ -14,7 +14,7 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include <sys/eventfd.h>
@@ -197,6 +197,7 @@ static void *client_thread(void *arg)
// connection closed or error
shutdown(conn_i.fd, SHUT_RDWR);
} else {
+ pthread_mutex_unlock(&lock);
break;
}
add_socket_pollin(&ring, conn_i.fd);
diff --git a/test/ringbuf-read.c b/test/ringbuf-read.c
new file mode 100644
index 0000000..673f2de
--- /dev/null
+++ b/test/ringbuf-read.c
@@ -0,0 +1,195 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: ring mapped provided buffers with reads
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define BUF_SIZE 4096
+#define NR_BUFS 64
+#define FSIZE (BUF_SIZE * NR_BUFS)
+
+#define BR_MASK (NR_BUFS - 1)
+
+static int no_buf_ring;
+
+static int verify_buffer(char *buf, char val)
+{
+ int i;
+
+ for (i = 0; i < BUF_SIZE; i++) {
+ if (buf[i] != val) {
+ fprintf(stderr, "got %d, wanted %d\n", buf[i], val);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int test(const char *filename, int dio, int async)
+{
+ struct io_uring_buf_reg reg = { };
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct io_uring ring;
+ struct io_uring_buf_ring *br;
+ int ret, fd, i;
+ char *buf;
+ void *ptr;
+
+ ret = io_uring_queue_init(NR_BUFS, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ if (dio)
+ fd = open(filename, O_DIRECT | O_RDONLY);
+ else
+ fd = open(filename, O_RDONLY);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ posix_fadvise(fd, 0, FSIZE, POSIX_FADV_DONTNEED);
+
+ if (posix_memalign((void **) &buf, 4096, FSIZE))
+ return 1;
+ if (posix_memalign((void **) &br, 4096, 4096))
+ return 1;
+
+ reg.ring_addr = (unsigned long) br;
+ reg.ring_entries = NR_BUFS;
+ reg.bgid = 1;
+
+ ret = io_uring_register_buf_ring(&ring, &reg, 0);
+ if (ret) {
+ if (ret == -EINVAL) {
+ no_buf_ring = 1;
+ return 0;
+ }
+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
+ return 1;
+ }
+
+ ptr = buf;
+ for (i = 0; i < NR_BUFS; i++) {
+ io_uring_buf_ring_add(br, ptr, BUF_SIZE, i + 1, BR_MASK, i);
+ ptr += BUF_SIZE;
+ }
+ io_uring_buf_ring_advance(br, NR_BUFS);
+
+ for (i = 0; i < NR_BUFS; i++) {
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_read(sqe, fd, NULL, BUF_SIZE, i * BUF_SIZE);
+ sqe->buf_group = 1;
+ sqe->flags |= IOSQE_BUFFER_SELECT;
+ if (async && !(i & 1))
+ sqe->flags |= IOSQE_ASYNC;
+ sqe->user_data = i + 1;
+ }
+
+ ret = io_uring_submit(&ring);
+ if (ret != NR_BUFS) {
+ fprintf(stderr, "submit: %d\n", ret);
+ return 1;
+ }
+
+ for (i = 0; i < NR_BUFS; i++) {
+ int bid, ud;
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait cqe failed %d\n", ret);
+ return 1;
+ }
+ if (cqe->res != BUF_SIZE) {
+ fprintf(stderr, "cqe res %d\n", cqe->res);
+ return 1;
+ }
+ if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
+ fprintf(stderr, "no buffer selected\n");
+ return 1;
+ }
+ bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
+ ud = cqe->user_data;
+ io_uring_cqe_seen(&ring, cqe);
+ if (verify_buffer(buf + ((bid - 1) * BUF_SIZE), ud))
+ return 1;
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ char buf[BUF_SIZE];
+ char fname[80];
+ int ret, fd, i, do_unlink;
+
+ if (argc > 1) {
+ strcpy(fname, argv[1]);
+ do_unlink = 0;
+ } else {
+ sprintf(fname, ".ringbuf-read.%d", getpid());
+ t_create_file(fname, FSIZE);
+ do_unlink = 1;
+ }
+
+ fd = open(fname, O_WRONLY);
+ if (fd < 0) {
+ perror("open");
+ goto err;
+ }
+ for (i = 0; i < NR_BUFS; i++) {
+ memset(buf, i + 1, BUF_SIZE);
+ ret = write(fd, buf, BUF_SIZE);
+ if (ret != BUF_SIZE) {
+ fprintf(stderr, "bad file prep write\n");
+ goto err;
+ }
+ }
+ close(fd);
+
+ ret = test(fname, 1, 0);
+ if (ret) {
+ fprintf(stderr, "dio test failed\n");
+ return ret;
+ }
+ if (no_buf_ring)
+ return 0;
+
+ ret = test(fname, 0, 0);
+ if (ret) {
+ fprintf(stderr, "buffered test failed\n");
+ return ret;
+ }
+
+ ret = test(fname, 1, 1);
+ if (ret) {
+ fprintf(stderr, "dio async test failed\n");
+ return ret;
+ }
+
+ ret = test(fname, 0, 1);
+ if (ret) {
+ fprintf(stderr, "buffered async test failed\n");
+ return ret;
+ }
+
+ return 0;
+err:
+ if (do_unlink)
+ unlink(fname);
+ return 1;
+}
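
One thing the test elides (each buffer is consumed exactly once): with ring
mapped buffers, a completed buffer belongs to the application until it is
handed back, so a long-running consumer must recycle. A sketch reusing the
test's BUF_SIZE/BR_MASK defines and its 1-based buffer ids (recycle_buf() is
illustrative only):

	static void recycle_buf(struct io_uring_buf_ring *br, char *bufs,
				struct io_uring_cqe *cqe)
	{
		int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

		/* hand the buffer back, then publish it to the kernel */
		io_uring_buf_ring_add(br, bufs + (bid - 1) * BUF_SIZE,
				      BUF_SIZE, bid, BR_MASK, 0);
		io_uring_buf_ring_advance(br, 1);
	}
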
diff --git a/test/rsrc_tags.c b/test/rsrc_tags.c
index 2b4890b..2d11d2a 100644
--- a/test/rsrc_tags.c
+++ b/test/rsrc_tags.c
@@ -27,11 +27,15 @@ static bool check_cq_empty(struct io_uring *ring)
struct io_uring_cqe *cqe = NULL;
int ret;
- sleep(1); /* doesn't happen immediately, so wait */
+ usleep(1000); /* doesn't happen immediately, so wait */
ret = io_uring_peek_cqe(ring, &cqe); /* nothing should be there */
return ret == -EAGAIN;
}
+/*
+ * There are io_uring_register_buffers_tags() and other wrappers,
+ * but they may change, so hand-code to specifically test this ABI.
+ */
static int register_rsrc(struct io_uring *ring, int type, int nr,
const void *arg, const __u64 *tags)
{
@@ -40,8 +44,8 @@ static int register_rsrc(struct io_uring *ring, int type, int nr,
memset(&reg, 0, sizeof(reg));
reg.nr = nr;
- reg.data = (__u64)arg;
- reg.tags = (__u64)tags;
+ reg.data = (__u64)(uintptr_t)arg;
+ reg.tags = (__u64)(uintptr_t)tags;
reg_type = IORING_REGISTER_FILES2;
if (type != TEST_IORING_RSRC_FILE)
@@ -52,6 +56,10 @@ static int register_rsrc(struct io_uring *ring, int type, int nr,
return ret ? -errno : 0;
}
+/*
+ * There are io_uring_register_buffers_update_tag() and other wrappers,
+ * but they may change, so hand-code to specifically test this ABI.
+ */
static int update_rsrc(struct io_uring *ring, int type, int nr, int off,
const void *arg, const __u64 *tags)
{
@@ -60,8 +68,8 @@ static int update_rsrc(struct io_uring *ring, int type, int nr, int off,
memset(&up, 0, sizeof(up));
up.offset = off;
- up.data = (__u64)arg;
- up.tags = (__u64)tags;
+ up.data = (__u64)(uintptr_t)arg;
+ up.tags = (__u64)(uintptr_t)tags;
up.nr = nr;
up_type = IORING_REGISTER_FILES_UPDATE2;
@@ -75,17 +83,17 @@ static int update_rsrc(struct io_uring *ring, int type, int nr, int off,
static bool has_rsrc_update(void)
{
struct io_uring ring;
- char buf[1024];
- struct iovec vec = {.iov_base = buf, .iov_len = sizeof(buf), };
int ret;
ret = io_uring_queue_init(1, &ring, 0);
- if (ret)
- return false;
+ if (ret) {
+ fprintf(stderr, "io_uring_queue_init() failed, %d\n", ret);
+ exit(1);
+ }
- ret = register_rsrc(&ring, TEST_IORING_RSRC_BUFFER, 1, &vec, NULL);
+ ret = ring.features & IORING_FEAT_RSRC_TAGS;
io_uring_queue_exit(&ring);
- return ret != -EINVAL;
+ return ret;
}
static int test_tags_generic(int nr, int type, void *rsrc, int ring_flags)
@@ -314,7 +322,7 @@ static int test_files(int ring_flags)
struct io_uring ring;
const int nr = 50;
int off = 5, i, ret, fd;
- int files[nr];
+ __s32 files[nr];
__u64 tags[nr], tag;
for (i = 0; i < nr; ++i) {
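
As the new comments note, the test hand-rolls the register syscalls to pin
down the raw ABI; ordinary callers would use the wrappers instead. A sketch of
the wrapper form for one tagged file slot (assuming the
io_uring_register_files_tags() wrapper is available in the linked liburing):

	static int register_one_tagged_file(struct io_uring *ring, int fd,
					    __u64 tag)
	{
		/* when the slot is later released, a CQE is posted with
		 * user_data == tag */
		return io_uring_register_files_tags(ring, &fd, &tag, 1);
	}
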
diff --git a/test/runtests-loop.sh b/test/runtests-loop.sh
index 4019eba..b80bc76 100755
--- a/test/runtests-loop.sh
+++ b/test/runtests-loop.sh
@@ -1,10 +1,10 @@
-#!/bin/bash
+#!/usr/bin/env bash
-TESTS="$@"
+TESTS=("$@")
ITER=0
while true; do
- ./runtests.sh "$TESTS"
+ ./runtests.sh "${TESTS[@]}"
RET="$?"
if [ "${RET}" -ne 0 ]; then
echo "Tests failed at loop $ITER"
diff --git a/test/runtests-quiet.sh b/test/runtests-quiet.sh
new file mode 100755
index 0000000..2bc7da0
--- /dev/null
+++ b/test/runtests-quiet.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+TESTS=("$@")
+RESULT_FILE=$(mktemp)
+./runtests.sh "${TESTS[@]}" > "$RESULT_FILE" 2>&1
+RET="$?"
+if [ "${RET}" -ne 0 ]; then
+ cat "$RESULT_FILE"
+fi
+rm "$RESULT_FILE"
+exit $RET
diff --git a/test/runtests.sh b/test/runtests.sh
index e8f4ae5..6d8f7af 100755
--- a/test/runtests.sh
+++ b/test/runtests.sh
@@ -1,13 +1,13 @@
-#!/bin/bash
+#!/usr/bin/env bash
-TESTS="$@"
+TESTS=("$@")
RET=0
TIMEOUT=60
DMESG_FILTER="cat"
-TEST_DIR=$(dirname $0)
+TEST_DIR=$(dirname "$0")
FAILED=""
SKIPPED=""
-MAYBE_FAILED=""
+TIMED_OUT=""
TEST_FILES=""
declare -A TEST_MAP
@@ -17,14 +17,15 @@ DO_KMSG="1"
# Include config.local if exists and check TEST_FILES for valid devices
if [ -f "$TEST_DIR/config.local" ]; then
- . $TEST_DIR/config.local
+ # shellcheck source=/dev/null disable=SC1091
+ . "$TEST_DIR/config.local"
for dev in $TEST_FILES; do
if [ ! -e "$dev" ]; then
echo "Test file $dev not valid"
exit 1
fi
done
- for dev in ${TEST_MAP[@]}; do
+ for dev in "${TEST_MAP[@]}"; do
if [ ! -e "$dev" ]; then
echo "Test file in map $dev not valid"
exit 1
@@ -37,7 +38,7 @@ _check_dmesg()
local dmesg_marker="$1"
local seqres="$2.seqres"
- if [ $DO_KMSG -eq 0 ]; then
+ if [ "$DO_KMSG" -eq 0 ]; then
return 0
fi
@@ -66,24 +67,31 @@ run_test()
{
local test_name="$1"
local dev="$2"
- local test_string=$test_name
+ local test_exec=("./$test_name")
+ local test_string="$test_name"
+ local out_name="$test_name"
# Specify test string to print
if [ -n "$dev" ]; then
+ test_exec+=("$dev")
test_string="$test_name $dev"
+ local suffix
+ suffix=$(basename "$dev")
+ out_name="$out_name.$suffix"
fi
# Log start of the test
if [ "$DO_KMSG" -eq 1 ]; then
local dmesg_marker="Running test $test_string:"
- echo $dmesg_marker | tee /dev/kmsg
+ echo "$dmesg_marker" > /dev/kmsg
else
local dmesg_marker=""
- echo Running test $test_name $dev
fi
+ printf "Running test %-55s" "$test_string"
# Do we have to exclude the test ?
- echo $TEST_EXCLUDE | grep -w "$test_name" > /dev/null 2>&1
+ echo "$TEST_EXCLUDE" | grep -w "$test_name" > /dev/null 2>&1
+ # shellcheck disable=SC2181
if [ $? -eq 0 ]; then
echo "Test skipped"
SKIPPED="$SKIPPED <$test_string>"
@@ -91,12 +99,19 @@ run_test()
fi
# Run the test
- timeout -s INT -k $TIMEOUT $TIMEOUT ./$test_name $dev
+ T_START=$(date +%s)
+ timeout -s INT -k $TIMEOUT $TIMEOUT "${test_exec[@]}"
local status=$?
+ T_END=$(date +%s)
+
+ if [ -e ./core ]; then
+ mv core "core-$test_name"
+ fi
# Check test status
if [ "$status" -eq 124 ]; then
echo "Test $test_name timed out (may not be a failure)"
+ TIMED_OUT="$TIMED_OUT <$test_string>"
elif [ "$status" -ne 0 ]; then
echo "Test $test_name failed with ret $status"
FAILED="$FAILED <$test_string>"
@@ -105,26 +120,36 @@ run_test()
echo "Test $test_name failed dmesg check"
FAILED="$FAILED <$test_string>"
RET=1
- elif [ -n "$dev" ]; then
- sleep .1
- ps aux | grep "\[io_wq_manager\]" > /dev/null
- if [ $? -eq 0 ]; then
- MAYBE_FAILED="$MAYBE_FAILED $test_string"
+ else
+ if [ -f "output/$out_name" ]; then
+ T_PREV=$(cat "output/$out_name")
+ else
+ T_PREV=""
+ fi
+ T_DIFF=$((T_END-T_START))
+ if [ -n "$T_PREV" ]; then
+ echo "$T_DIFF sec [$T_PREV]"
+ else
+ echo "$T_DIFF sec"
fi
+ echo $T_DIFF > "output/$out_name"
fi
}
# Run all specified tests
-for tst in $TESTS; do
- if [ ! -n "${TEST_MAP[$tst]}" ]; then
- run_test $tst
- if [ ! -z "$TEST_FILES" ]; then
+for tst in "${TESTS[@]}"; do
+ if [ ! -d output ]; then
+ mkdir -p output
+ fi
+ if [ -z "${TEST_MAP[$tst]}" ]; then
+ run_test "$tst"
+ if [ -n "$TEST_FILES" ]; then
for dev in $TEST_FILES; do
- run_test $tst $dev
+ run_test "$tst" "$dev"
done
fi
else
- run_test $tst ${TEST_MAP[$tst]}
+ run_test "$tst" "${TEST_MAP[$tst]}"
fi
done
@@ -132,18 +157,14 @@ if [ -n "$SKIPPED" ]; then
echo "Tests skipped: $SKIPPED"
fi
+if [ -n "$TIMED_OUT" ]; then
+ echo "Tests timed out: $TIMED_OUT"
+fi
+
if [ "${RET}" -ne 0 ]; then
echo "Tests failed: $FAILED"
exit $RET
else
- sleep 1
- ps aux | grep "\[io_wq_manager\]" > /dev/null
- if [ $? -ne 0 ]; then
- MAYBE_FAILED=""
- fi
- if [ ! -z "$MAYBE_FAILED" ]; then
- echo "Tests _maybe_ failed: $MAYBE_FAILED"
- fi
echo "All tests passed"
exit 0
fi
diff --git a/test/rw_merge_test.c b/test/rw_merge_test.c
index 43feed4..03f6467 100644
--- a/test/rw_merge_test.c
+++ b/test/rw_merge_test.c
@@ -35,7 +35,8 @@ int main(int argc, char *argv[])
assert(!ret);
fd = open("testfile", O_RDWR | O_CREAT, 0644);
- assert(ret >= 0);
+ assert(fd >= 0);
+ unlink("testfile");
ret = ftruncate(fd, 4096);
assert(!ret);
diff --git a/test/send_recv.c b/test/send_recv.c
index 38ae27f..a7b001a 100644
--- a/test/send_recv.c
+++ b/test/send_recv.c
@@ -19,14 +19,9 @@ static char str[] = "This is a test of send and recv over io_uring!";
#define MAX_MSG 128
-#define PORT 10200
+#define PORT 10202
#define HOST "127.0.0.1"
-#if 0
-# define io_uring_prep_send io_uring_prep_write
-# define io_uring_prep_recv io_uring_prep_read
-#endif
-
static int recv_prep(struct io_uring *ring, struct iovec *iov, int *sock,
int registerfiles)
{
@@ -200,7 +195,7 @@ static int do_send(void)
return 1;
}
- ret = connect(sockfd, &saddr, sizeof(saddr));
+ ret = connect(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
if (ret < 0) {
perror("connect");
return 1;
@@ -259,7 +254,7 @@ static int test(int use_sqthread, int regfiles)
pthread_mutex_lock(&rd.mutex);
do_send();
pthread_join(recv_thread, &retval);
- return (int)(intptr_t)retval;
+ return (intptr_t)retval;
}
int main(int argc, char *argv[])
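Note: the connect() change above is the standard BSD-sockets cast: the call
takes a generic const struct sockaddr *, so a concrete struct sockaddr_in
must be converted explicitly in C. A minimal sketch (HOST and PORT as defined
in this test):

    struct sockaddr_in saddr = { 0 };

    saddr.sin_family = AF_INET;
    saddr.sin_port = htons(PORT);
    inet_pton(AF_INET, HOST, &saddr.sin_addr);
    /* without the cast, compilers warn about incompatible pointer types */
    ret = connect(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));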
diff --git a/test/send_recvmsg.c b/test/send_recvmsg.c
index 2ff8d9d..cce6c45 100644
--- a/test/send_recvmsg.c
+++ b/test/send_recvmsg.c
@@ -17,9 +17,11 @@
static char str[] = "This is a test of sendmsg and recvmsg over io_uring!";
+static int ud;
+
#define MAX_MSG 128
-#define PORT 10200
+#define PORT 10203
#define HOST "127.0.0.1"
#define BUF_BGID 10
@@ -27,31 +29,32 @@ static char str[] = "This is a test of sendmsg and recvmsg over io_uring!";
#define MAX_IOV_COUNT 10
-static int recv_prep(struct io_uring *ring, struct iovec iov[], int iov_count,
- int bgid)
+static int no_pbuf_ring;
+
+static int recv_prep(struct io_uring *ring, int *sockfd, struct iovec iov[],
+ int iov_count, int bgid, int async)
{
struct sockaddr_in saddr;
struct msghdr msg;
struct io_uring_sqe *sqe;
- int sockfd, ret;
- int val = 1;
+ int ret, val = 1;
memset(&saddr, 0, sizeof(saddr));
saddr.sin_family = AF_INET;
saddr.sin_addr.s_addr = htonl(INADDR_ANY);
saddr.sin_port = htons(PORT);
- sockfd = socket(AF_INET, SOCK_DGRAM, 0);
- if (sockfd < 0) {
+ *sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (*sockfd < 0) {
perror("socket");
return 1;
}
val = 1;
- setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val));
- setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+ setsockopt(*sockfd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val));
+ setsockopt(*sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
- ret = bind(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ ret = bind(*sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
if (ret < 0) {
perror("bind");
goto err;
@@ -63,13 +66,16 @@ static int recv_prep(struct io_uring *ring, struct iovec iov[], int iov_count,
return 1;
}
- io_uring_prep_recvmsg(sqe, sockfd, &msg, 0);
+ io_uring_prep_recvmsg(sqe, *sockfd, &msg, 0);
if (bgid) {
iov->iov_base = NULL;
sqe->flags |= IOSQE_BUFFER_SELECT;
sqe->buf_group = bgid;
iov_count = 1;
}
+ sqe->user_data = ++ud;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
memset(&msg, 0, sizeof(msg));
msg.msg_namelen = sizeof(struct sockaddr_in);
msg.msg_iov = iov;
@@ -81,18 +87,19 @@ static int recv_prep(struct io_uring *ring, struct iovec iov[], int iov_count,
goto err;
}
- close(sockfd);
return 0;
err:
- close(sockfd);
+ close(*sockfd);
return 1;
}
struct recv_data {
pthread_mutex_t *mutex;
int buf_select;
+ int buf_ring;
int no_buf_add;
int iov_count;
+ int async;
};
static int do_recvmsg(struct io_uring *ring, char buf[MAX_MSG + 1],
@@ -107,18 +114,18 @@ static int do_recvmsg(struct io_uring *ring, char buf[MAX_MSG + 1],
goto err;
}
if (cqe->res < 0) {
- if (rd->no_buf_add && rd->buf_select)
+ if (rd->no_buf_add && (rd->buf_select || rd->buf_ring))
return 0;
fprintf(stderr, "%s: failed cqe: %d\n", __FUNCTION__, cqe->res);
goto err;
}
- if (cqe->flags) {
+ if (cqe->flags & IORING_CQE_F_BUFFER) {
int bid = cqe->flags >> 16;
if (bid != BUF_BID)
fprintf(stderr, "Buffer ID mismatch %d\n", bid);
}
- if (rd->no_buf_add && rd->buf_select) {
+ if (rd->no_buf_add && (rd->buf_ring || rd->buf_select)) {
fprintf(stderr, "Expected -ENOBUFS: %d\n", cqe->res);
goto err;
}
@@ -158,12 +165,14 @@ static void *recv_fn(void *data)
{
struct recv_data *rd = data;
pthread_mutex_t *mutex = rd->mutex;
+ struct io_uring_buf_ring *br = NULL;
char buf[MAX_MSG + 1];
struct iovec iov[MAX_IOV_COUNT];
- struct io_uring_sqe *sqe;
- struct io_uring_cqe *cqe;
struct io_uring ring;
- int ret;
+ int ret, sockfd;
+
+ if (rd->buf_ring && no_pbuf_ring)
+ goto out_no_ring;
init_iov(iov, rd->iov_count, buf);
@@ -173,34 +182,61 @@ static void *recv_fn(void *data)
goto err;
}
- if (rd->buf_select && !rd->no_buf_add) {
- sqe = io_uring_get_sqe(&ring);
- io_uring_prep_provide_buffers(sqe, buf, sizeof(buf) -1, 1,
- BUF_BGID, BUF_BID);
- ret = io_uring_submit(&ring);
- if (ret != 1) {
- fprintf(stderr, "submit ret=%d\n", ret);
- goto err;
- }
-
- ret = io_uring_wait_cqe(&ring, &cqe);
- if (ret) {
- fprintf(stderr, "wait_cqe=%d\n", ret);
- goto err;
- }
- ret = cqe->res;
- io_uring_cqe_seen(&ring, cqe);
- if (ret == -EINVAL) {
- fprintf(stdout, "PROVIDE_BUFFERS not supported, skip\n");
- goto out;
- goto err;
- } else if (ret < 0) {
- fprintf(stderr, "PROVIDER_BUFFERS %d\n", ret);
- goto err;
+ if ((rd->buf_ring || rd->buf_select) && !rd->no_buf_add) {
+ if (rd->buf_ring) {
+ struct io_uring_buf_reg reg = { };
+ void *ptr;
+
+ if (posix_memalign(&ptr, 4096, 4096))
+ goto err;
+
+ reg.ring_addr = (unsigned long) ptr;
+ reg.ring_entries = 1;
+ reg.bgid = BUF_BGID;
+ if (io_uring_register_buf_ring(&ring, &reg, 0)) {
+ no_pbuf_ring = 1;
+ goto out;
+ }
+
+ br = ptr;
+ io_uring_buf_ring_init(br);
+ io_uring_buf_ring_add(br, buf, sizeof(buf), BUF_BID,
+ io_uring_buf_ring_mask(1), 0);
+ io_uring_buf_ring_advance(br, 1);
+ } else {
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_provide_buffers(sqe, buf, sizeof(buf) - 1,
+ 1, BUF_BGID, BUF_BID);
+ sqe->user_data = ++ud;
+ ret = io_uring_submit(&ring);
+ if (ret != 1) {
+ fprintf(stderr, "submit ret=%d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe=%d\n", ret);
+ goto err;
+ }
+ ret = cqe->res;
+ io_uring_cqe_seen(&ring, cqe);
+ if (ret == -EINVAL) {
+ fprintf(stdout, "PROVIDE_BUFFERS not supported, skip\n");
+ goto out;
+ } else if (ret < 0) {
+ fprintf(stderr, "PROVIDER_BUFFERS %d\n", ret);
+ goto err;
+ }
}
}
- ret = recv_prep(&ring, iov, rd->iov_count, rd->buf_select ? BUF_BGID : 0);
+ ret = recv_prep(&ring, &sockfd, iov, rd->iov_count,
+ (rd->buf_ring || rd->buf_select) ? BUF_BGID : 0,
+ rd->async);
if (ret) {
fprintf(stderr, "recv_prep failed: %d\n", ret);
goto err;
@@ -208,14 +244,19 @@ static void *recv_fn(void *data)
pthread_mutex_unlock(mutex);
ret = do_recvmsg(&ring, buf, rd);
+ close(sockfd);
io_uring_queue_exit(&ring);
-
+ if (br)
+ free(br);
err:
return (void *)(intptr_t)ret;
out:
- pthread_mutex_unlock(mutex);
io_uring_queue_exit(&ring);
+out_no_ring:
+ pthread_mutex_unlock(mutex);
+ if (br)
+ free(br);
return NULL;
}
@@ -255,8 +296,11 @@ static int do_sendmsg(void)
return 1;
}
+ usleep(10000);
+
sqe = io_uring_get_sqe(&ring);
io_uring_prep_sendmsg(sqe, sockfd, &msg, 0);
+ sqe->user_data = ++ud;
ret = io_uring_submit(&ring);
if (ret <= 0) {
@@ -277,7 +321,8 @@ err:
return 1;
}
-static int test(int buf_select, int no_buf_add, int iov_count)
+static int test(int buf_select, int buf_ring, int no_buf_add, int iov_count,
+ int async)
{
struct recv_data rd;
pthread_mutexattr_t attr;
@@ -286,6 +331,9 @@ static int test(int buf_select, int no_buf_add, int iov_count)
int ret;
void *retval;
+ if (buf_select || buf_ring)
+ assert(iov_count == 1);
+
pthread_mutexattr_init(&attr);
pthread_mutexattr_setpshared(&attr, 1);
pthread_mutex_init(&mutex, &attr);
@@ -293,8 +341,10 @@ static int test(int buf_select, int no_buf_add, int iov_count)
rd.mutex = &mutex;
rd.buf_select = buf_select;
+ rd.buf_ring = buf_ring;
rd.no_buf_add = no_buf_add;
rd.iov_count = iov_count;
+ rd.async = async;
ret = pthread_create(&recv_thread, NULL, recv_fn, &rd);
if (ret) {
pthread_mutex_unlock(&mutex);
@@ -305,7 +355,7 @@ static int test(int buf_select, int no_buf_add, int iov_count)
pthread_mutex_lock(&mutex);
do_sendmsg();
pthread_join(recv_thread, &retval);
- ret = (int)(intptr_t)retval;
+ ret = (intptr_t)retval;
return ret;
}
@@ -317,27 +367,87 @@ int main(int argc, char *argv[])
if (argc > 1)
return 0;
- ret = test(0, 0, 1);
+ ret = test(0, 0, 0, 1, 0);
if (ret) {
- fprintf(stderr, "send_recvmsg 0 failed\n");
+ fprintf(stderr, "send_recvmsg 0 0 0 1 0 failed\n");
return 1;
}
- ret = test(0, 0, 10);
+ ret = test(0, 0, 0, 10, 0);
if (ret) {
fprintf(stderr, "send_recvmsg multi iov failed\n");
return 1;
}
- ret = test(1, 0, 1);
+ ret = test(1, 0, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg 1 0 0 1 0 failed\n");
+ return 1;
+ }
+
+ ret = test(1, 0, 1, 1, 0);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg 1 0 1 1 0 failed\n");
+ return 1;
+ }
+
+ ret = test(0, 1, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg 0 1 0 1 0 failed\n");
+ return 1;
+ }
+
+ ret = test(1, 1, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg 1 1 0 1 0 failed\n");
+ return 1;
+ }
+
+ ret = test(1, 1, 1, 1, 0);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg 1 1 1 1 0 failed\n");
+ return 1;
+ }
+
+ ret = test(0, 0, 0, 1, 1);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg async 0 0 0 1 1 failed\n");
+ return 1;
+ }
+
+ ret = test(0, 0, 0, 10, 1);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg async multi iov failed\n");
+ return 1;
+ }
+
+ ret = test(1, 0, 0, 1, 1);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg async 1 0 0 1 1 failed\n");
+ return 1;
+ }
+
+ ret = test(1, 0, 1, 1, 1);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg async 1 0 1 1 1 failed\n");
+ return 1;
+ }
+
+ ret = test(0, 1, 0, 1, 1);
+ if (ret) {
+ fprintf(stderr, "send_recvmsg async 0 1 0 1 1 failed\n");
+ return 1;
+ }
+
+ ret = test(1, 1, 0, 1, 1);
if (ret) {
- fprintf(stderr, "send_recvmsg 1 0 failed\n");
+ fprintf(stderr, "send_recvmsg async 1 1 0 1 1 failed\n");
return 1;
}
- ret = test(1, 1, 1);
+ ret = test(1, 1, 1, 1, 1);
if (ret) {
- fprintf(stderr, "send_recvmsg 1 1 failed\n");
+ fprintf(stderr, "send_recvmsg async 1 1 1 1 1 failed\n");
return 1;
}
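Note: the buf_ring path added above exercises the provided-buffer-ring API
(liburing 2.2+, kernel 5.19+). A minimal sketch of the registration pattern,
reusing this test's ring, buf, BUF_BGID and BUF_BID names:

    struct io_uring_buf_reg reg = { };
    struct io_uring_buf_ring *br;
    void *ptr;

    /* the ring memory must be page aligned */
    if (posix_memalign(&ptr, 4096, 4096))
        return 1;
    reg.ring_addr = (unsigned long) ptr;   /* where the kernel finds the ring */
    reg.ring_entries = 1;                  /* must be a power of two */
    reg.bgid = BUF_BGID;                   /* group id referenced by SQEs */
    if (io_uring_register_buf_ring(&ring, &reg, 0))
        return 1;                          /* older kernel: treat as a skip */

    br = ptr;
    io_uring_buf_ring_init(br);
    /* publish one buffer, then advance the tail to make it visible */
    io_uring_buf_ring_add(br, buf, sizeof(buf), BUF_BID,
                          io_uring_buf_ring_mask(1), 0);
    io_uring_buf_ring_advance(br, 1);

A receive SQE then selects from the group via IOSQE_BUFFER_SELECT with
sqe->buf_group = BUF_BGID, exactly as recv_prep() does above.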
diff --git a/test/sendmsg_fs_cve.c b/test/sendmsg_fs_cve.c
index 8de220a..2ce3114 100644
--- a/test/sendmsg_fs_cve.c
+++ b/test/sendmsg_fs_cve.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
/*
* repro-CVE-2020-29373 -- Reproducer for CVE-2020-29373.
*
@@ -20,6 +21,7 @@
#include <unistd.h>
#include <stdio.h>
+#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -154,7 +156,13 @@ int main(int argc, char *argv[])
if (!c) {
close(rcv_sock);
- if (chroot(tmpdir)) {
+ r = chroot(tmpdir);
+ if (r) {
+ if (errno == EPERM) {
+ fprintf(stderr, "chroot not allowed, skip\n");
+ return 0;
+ }
+
perror("chroot()");
return 1;
}
diff --git a/test/short-read.c b/test/short-read.c
index 02eee04..a6f2620 100644
--- a/test/short-read.c
+++ b/test/short-read.c
@@ -6,7 +6,7 @@
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "helpers.h"
diff --git a/test/shutdown.c b/test/shutdown.c
index 5aa1371..14c7407 100644
--- a/test/shutdown.c
+++ b/test/shutdown.c
@@ -6,6 +6,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
+#include <string.h>
#include <assert.h>
#include <errno.h>
@@ -15,6 +16,7 @@
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include "liburing.h"
@@ -42,8 +44,8 @@ int main(int argc, char *argv[])
assert(ret != -1);
addr.sin_family = AF_INET;
- addr.sin_port = (rand() % 61440) + 4096;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_port = htons((rand() % 61440) + 4096);
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
ret = bind(recv_s0, (struct sockaddr*)&addr, sizeof(addr));
assert(ret != -1);
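Note: the two-line fix above is about endianness: the multi-byte fields of
struct sockaddr_in are kept in network (big-endian) byte order, so the old
raw host-order constants only worked by accident on little-endian hosts. The
portable forms:

    addr.sin_port = htons((rand() % 61440) + 4096);  /* host -> network order */
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");   /* already network order */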
diff --git a/test/sigfd-deadlock.c b/test/sigfd-deadlock.c
index 038b094..277b342 100644
--- a/test/sigfd-deadlock.c
+++ b/test/sigfd-deadlock.c
@@ -8,7 +8,7 @@
#include <unistd.h>
#include <sys/signalfd.h>
#include <sys/epoll.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <stdio.h>
#include "liburing.h"
diff --git a/test/skip-cqe.c b/test/skip-cqe.c
new file mode 100644
index 0000000..99b882b
--- /dev/null
+++ b/test/skip-cqe.c
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: MIT */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <assert.h>
+
+#include "liburing.h"
+
+#define LINK_SIZE 6
+#define TIMEOUT_USER_DATA (-1)
+
+static int fds[2];
+
+/* should be successfully submitted but fails during execution */
+static void prep_exec_fail_req(struct io_uring_sqe *sqe)
+{
+ io_uring_prep_write(sqe, fds[1], NULL, 100, 0);
+}
+
+static int test_link_success(struct io_uring *ring, int nr, bool skip_last)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+
+ for (i = 0; i < nr; ++i) {
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_nop(sqe);
+ if (i != nr - 1 || skip_last)
+ sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
+ sqe->user_data = i;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != nr) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ if (!skip_last) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret != 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->res != 0) {
+ fprintf(stderr, "nop failed: res %d\n", cqe->res);
+ goto err;
+ }
+ if (cqe->user_data != nr - 1) {
+ fprintf(stderr, "invalid user_data %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ if (io_uring_peek_cqe(ring, &cqe) >= 0) {
+ fprintf(stderr, "single CQE expected %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ return 0;
+err:
+ return 1;
+}
+
+static int test_link_fail(struct io_uring *ring, int nr, int fail_idx)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+
+ for (i = 0; i < nr; ++i) {
+ sqe = io_uring_get_sqe(ring);
+ if (i == fail_idx)
+ prep_exec_fail_req(sqe);
+ else
+ io_uring_prep_nop(sqe);
+
+ if (i != nr - 1)
+ sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
+ sqe->user_data = i;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != nr) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret != 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (!cqe->res || cqe->user_data != fail_idx) {
+ fprintf(stderr, "got: user_data %d res %d, expected data: %d\n",
+ (int)cqe->user_data, cqe->res, fail_idx);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+
+ if (io_uring_peek_cqe(ring, &cqe) >= 0) {
+ fprintf(stderr, "single CQE expected %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ return 0;
+err:
+ return 1;
+}
+
+static int test_ltimeout_cancel(struct io_uring *ring, int nr, int tout_idx,
+ bool async, int fail_idx)
+{
+ struct __kernel_timespec ts = {.tv_sec = 1, .tv_nsec = 0};
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+ int e_res = 0, e_idx = nr - 1;
+
+ if (fail_idx >= 0) {
+ e_res = -EFAULT;
+ e_idx = fail_idx;
+ }
+
+ for (i = 0; i < nr; ++i) {
+ sqe = io_uring_get_sqe(ring);
+ if (i == fail_idx)
+ prep_exec_fail_req(sqe);
+ else
+ io_uring_prep_nop(sqe);
+ sqe->user_data = i;
+ sqe->flags |= IOSQE_IO_LINK;
+ if (async)
+ sqe->flags |= IOSQE_ASYNC;
+ if (i != nr - 1)
+ sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
+
+ if (i == tout_idx) {
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_link_timeout(sqe, &ts, 0);
+ sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
+ sqe->user_data = TIMEOUT_USER_DATA;
+ }
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != nr + 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret != 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->user_data != e_idx) {
+ fprintf(stderr, "invalid user_data %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ if (cqe->res != e_res) {
+ fprintf(stderr, "unexpected res: %d\n", cqe->res);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+
+ if (io_uring_peek_cqe(ring, &cqe) >= 0) {
+ fprintf(stderr, "single CQE expected %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ return 0;
+err:
+ return 1;
+}
+
+static int test_ltimeout_fire(struct io_uring *ring, bool async,
+ bool skip_main, bool skip_tout)
+{
+ char buf[1];
+ struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1000000};
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+ int nr = 1 + !skip_tout;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_read(sqe, fds[0], buf, sizeof(buf), 0);
+ sqe->flags |= IOSQE_IO_LINK;
+ sqe->flags |= async ? IOSQE_ASYNC : 0;
+ sqe->flags |= skip_main ? IOSQE_CQE_SKIP_SUCCESS : 0;
+ sqe->user_data = 0;
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_link_timeout(sqe, &ts, 0);
+ sqe->flags |= skip_tout ? IOSQE_CQE_SKIP_SUCCESS : 0;
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret != 2) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return 1;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret != 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return 1;
+ }
+ switch (cqe->user_data) {
+ case 0:
+ if (cqe->res != -ECANCELED && cqe->res != -EINTR) {
+ fprintf(stderr, "unexpected read return: %d\n", cqe->res);
+ return 1;
+ }
+ break;
+ case 1:
+ if (skip_tout) {
+ fprintf(stderr, "extra timeout cqe, %d\n", cqe->res);
+ return 1;
+ }
+ break;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+
+ if (io_uring_peek_cqe(ring, &cqe) >= 0) {
+ fprintf(stderr, "single CQE expected: got data: %i res: %i\n",
+ (int)cqe->user_data, cqe->res);
+ return 1;
+ }
+ return 0;
+}
+
+static int test_hardlink(struct io_uring *ring, int nr, int fail_idx,
+ int skip_idx, bool hardlink_last)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret, i;
+
+ assert(fail_idx < nr);
+ assert(skip_idx < nr);
+
+ for (i = 0; i < nr; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (i == fail_idx)
+ prep_exec_fail_req(sqe);
+ else
+ io_uring_prep_nop(sqe);
+ if (i != nr - 1 || hardlink_last)
+ sqe->flags |= IOSQE_IO_HARDLINK;
+ if (i == skip_idx)
+ sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
+ sqe->user_data = i;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != nr) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ goto err;
+ }
+
+ for (i = 0; i < nr; i++) {
+ if (i == skip_idx && fail_idx != skip_idx)
+ continue;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret != 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ goto err;
+ }
+ if (cqe->user_data != i) {
+ fprintf(stderr, "invalid user_data %d (%i)\n",
+ (int)cqe->user_data, i);
+ goto err;
+ }
+ if (i == fail_idx) {
+ if (cqe->res >= 0) {
+ fprintf(stderr, "req should've failed %d %d\n",
+ (int)cqe->user_data, cqe->res);
+ goto err;
+ }
+ } else {
+ if (cqe->res) {
+ fprintf(stderr, "req error %d %d\n",
+ (int)cqe->user_data, cqe->res);
+ goto err;
+ }
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ if (io_uring_peek_cqe(ring, &cqe) >= 0) {
+ fprintf(stderr, "single CQE expected %i\n", (int)cqe->user_data);
+ goto err;
+ }
+ return 0;
+err:
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret, i, j, k;
+ int mid_idx = LINK_SIZE / 2;
+ int last_idx = LINK_SIZE - 1;
+
+ if (argc > 1)
+ return 0;
+
+ if (pipe(fds)) {
+ fprintf(stderr, "pipe() failed\n");
+ return 1;
+ }
+ ret = io_uring_queue_init(16, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return 1;
+ }
+
+ if (!(ring.features & IORING_FEAT_CQE_SKIP)) {
+ printf("IOSQE_CQE_SKIP_SUCCESS is not supported, skip\n");
+ return 0;
+ }
+
+ for (i = 0; i < 4; i++) {
+ bool skip_last = i & 1;
+ int sz = (i & 2) ? LINK_SIZE : 1;
+
+ ret = test_link_success(&ring, sz, skip_last);
+ if (ret) {
+ fprintf(stderr, "test_link_success sz %d, %d last\n",
+ skip_last, sz);
+ return ret;
+ }
+ }
+
+ ret = test_link_fail(&ring, LINK_SIZE, mid_idx);
+ if (ret) {
+ fprintf(stderr, "test_link_fail mid failed\n");
+ return ret;
+ }
+
+ ret = test_link_fail(&ring, LINK_SIZE, last_idx);
+ if (ret) {
+ fprintf(stderr, "test_link_fail last failed\n");
+ return ret;
+ }
+
+ for (i = 0; i < 2; i++) {
+ bool async = i & 1;
+
+ ret = test_ltimeout_cancel(&ring, 1, 0, async, -1);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel 1 failed, %i\n",
+ async);
+ return ret;
+ }
+ ret = test_ltimeout_cancel(&ring, LINK_SIZE, mid_idx, async, -1);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel mid failed, %i\n",
+ async);
+ return ret;
+ }
+ ret = test_ltimeout_cancel(&ring, LINK_SIZE, last_idx, async, -1);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel last failed, %i\n",
+ async);
+ return ret;
+ }
+ ret = test_ltimeout_cancel(&ring, LINK_SIZE, mid_idx, async, mid_idx);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel fail mid failed, %i\n",
+ async);
+ return ret;
+ }
+ ret = test_ltimeout_cancel(&ring, LINK_SIZE, mid_idx, async, mid_idx - 1);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel fail2 mid failed, %i\n",
+ async);
+ return ret;
+ }
+ ret = test_ltimeout_cancel(&ring, LINK_SIZE, mid_idx, async, mid_idx + 1);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_cancel fail3 mid failed, %i\n",
+ async);
+ return ret;
+ }
+ }
+
+ for (i = 0; i < 8; i++) {
+ bool async = i & 1;
+ bool skip1 = i & 2;
+ bool skip2 = i & 4;
+
+ ret = test_ltimeout_fire(&ring, async, skip1, skip2);
+ if (ret) {
+ fprintf(stderr, "test_ltimeout_fire failed\n");
+ return ret;
+ }
+ }
+
+ /* test 3 positions, start/middle/end of the link, i.e. indexes 0, 3, 6 */
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 3; j++) {
+ for (k = 0; k < 2; k++) {
+ bool mark_last = k & 1;
+
+ ret = test_hardlink(&ring, 7, i * 3, j * 3, mark_last);
+ if (ret) {
+ fprintf(stderr, "test_hardlink failed"
+ "fail %i skip %i mark last %i\n",
+ i * 3, j * 3, k);
+ return 1;
+ }
+ }
+ }
+ }
+
+ close(fds[0]);
+ close(fds[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
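Note: skip-cqe.t exercises IOSQE_CQE_SKIP_SUCCESS (advertised through the
IORING_FEAT_CQE_SKIP feature bit, checked in main() above): a flagged request
posts no CQE when it succeeds, but still posts one on failure. A minimal
sketch of the common "silent link head" pattern:

    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_nop(sqe);
    sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS; /* silent on success */
    sqe->user_data = 1;

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_nop(sqe);        /* link tail, completes normally */
    sqe->user_data = 2;

    io_uring_submit(&ring);
    io_uring_wait_cqe(&ring, &cqe);
    /* on success: exactly one CQE, with user_data == 2; had the head
     * failed, its CQE would appear instead and the tail would post
     * -ECANCELED */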
diff --git a/test/socket-rw-eagain.c b/test/socket-rw-eagain.c
index f15c0c1..2d6a817 100644
--- a/test/socket-rw-eagain.c
+++ b/test/socket-rw-eagain.c
@@ -15,6 +15,7 @@
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include "liburing.h"
@@ -24,6 +25,7 @@ int main(int argc, char *argv[])
int32_t recv_s0;
int32_t val = 1;
struct sockaddr_in addr;
+ struct iovec iov_r[1], iov_w[1];
if (argc > 1)
return 0;
@@ -38,10 +40,10 @@ int main(int argc, char *argv[])
assert(ret != -1);
addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
do {
- addr.sin_port = (rand() % 61440) + 4096;
+ addr.sin_port = htons((rand() % 61440) + 4096);
ret = bind(recv_s0, (struct sockaddr*)&addr, sizeof(addr));
if (!ret)
break;
@@ -92,36 +94,36 @@ int main(int argc, char *argv[])
}
struct io_uring m_io_uring;
+ struct io_uring_params p = { };
- ret = io_uring_queue_init(32, &m_io_uring, 0);
+ ret = io_uring_queue_init_params(32, &m_io_uring, &p);
assert(ret >= 0);
+ if (p.features & IORING_FEAT_FAST_POLL)
+ return 0;
+
char recv_buff[128];
char send_buff[128];
{
- struct iovec iov[1];
-
- iov[0].iov_base = recv_buff;
- iov[0].iov_len = sizeof(recv_buff);
+ iov_r[0].iov_base = recv_buff;
+ iov_r[0].iov_len = sizeof(recv_buff);
struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
assert(sqe != NULL);
- io_uring_prep_readv(sqe, p_fd[0], iov, 1, 0);
+ io_uring_prep_readv(sqe, p_fd[0], iov_r, 1, 0);
sqe->user_data = 1;
}
{
- struct iovec iov[1];
-
- iov[0].iov_base = send_buff;
- iov[0].iov_len = sizeof(send_buff);
+ iov_w[0].iov_base = send_buff;
+ iov_w[0].iov_len = sizeof(send_buff);
struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
assert(sqe != NULL);
- io_uring_prep_writev(sqe, p_fd[1], iov, 1, 0);
+ io_uring_prep_writev(sqe, p_fd[1], iov_w, 1, 0);
sqe->user_data = 2;
}
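Note: the early return on IORING_FEAT_FAST_POLL above is a feature gate, not
a failure path: kernels with fast poll retry pollable I/O internally, so the
-EAGAIN behaviour this test provokes can no longer be observed. The probe
pattern, for reference:

    struct io_uring_params p = { };

    ret = io_uring_queue_init_params(32, &m_io_uring, &p);
    assert(ret >= 0);
    if (p.features & IORING_FEAT_FAST_POLL)
        return 0;   /* nothing to test on this kernel */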
diff --git a/test/socket-rw-offset.c b/test/socket-rw-offset.c
new file mode 100644
index 0000000..987b6c9
--- /dev/null
+++ b/test/socket-rw-offset.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Check that a readv on a socket queued before a writev doesn't hang
+ * the processing.
+ *
+ * From Hrvoje Zeba <zeba.hrvoje@gmail.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "liburing.h"
+
+int main(int argc, char *argv[])
+{
+ int p_fd[2], ret;
+ int32_t recv_s0;
+ int32_t val = 1;
+ struct sockaddr_in addr;
+ struct iovec iov_r[1], iov_w[1];
+
+ if (argc > 1)
+ return 0;
+
+ srand(getpid());
+
+ recv_s0 = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+
+ ret = setsockopt(recv_s0, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val));
+ assert(ret != -1);
+ ret = setsockopt(recv_s0, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+ assert(ret != -1);
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+ do {
+ addr.sin_port = htons((rand() % 61440) + 4096);
+ ret = bind(recv_s0, (struct sockaddr*)&addr, sizeof(addr));
+ if (!ret)
+ break;
+ if (errno != EADDRINUSE) {
+ perror("bind");
+ exit(1);
+ }
+ } while (1);
+ ret = listen(recv_s0, 128);
+ assert(ret != -1);
+
+
+ p_fd[1] = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+
+ val = 1;
+ ret = setsockopt(p_fd[1], IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
+ assert(ret != -1);
+
+ int32_t flags = fcntl(p_fd[1], F_GETFL, 0);
+ assert(flags != -1);
+
+ flags |= O_NONBLOCK;
+ ret = fcntl(p_fd[1], F_SETFL, flags);
+ assert(ret != -1);
+
+ ret = connect(p_fd[1], (struct sockaddr*)&addr, sizeof(addr));
+ assert(ret == -1);
+
+ flags = fcntl(p_fd[1], F_GETFL, 0);
+ assert(flags != -1);
+
+ flags &= ~O_NONBLOCK;
+ ret = fcntl(p_fd[1], F_SETFL, flags);
+ assert(ret != -1);
+
+ p_fd[0] = accept(recv_s0, NULL, NULL);
+ assert(p_fd[0] != -1);
+
+ while (1) {
+ int32_t code;
+ socklen_t code_len = sizeof(code);
+
+ ret = getsockopt(p_fd[1], SOL_SOCKET, SO_ERROR, &code, &code_len);
+ assert(ret != -1);
+
+ if (!code)
+ break;
+ }
+
+ struct io_uring m_io_uring;
+ struct io_uring_params p = { };
+
+ ret = io_uring_queue_init_params(32, &m_io_uring, &p);
+ assert(ret >= 0);
+
+ /* skip for kernels without cur position read/write */
+ if (!(p.features & IORING_FEAT_RW_CUR_POS))
+ return 0;
+
+ char recv_buff[128];
+ char send_buff[128];
+
+ {
+ iov_r[0].iov_base = recv_buff;
+ iov_r[0].iov_len = sizeof(recv_buff);
+
+ struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
+ assert(sqe != NULL);
+
+ io_uring_prep_readv(sqe, p_fd[0], iov_r, 1, -1);
+ }
+
+ {
+ iov_w[0].iov_base = send_buff;
+ iov_w[0].iov_len = sizeof(send_buff);
+
+ struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
+ assert(sqe != NULL);
+
+ io_uring_prep_writev(sqe, p_fd[1], iov_w, 1, 0);
+ }
+
+ ret = io_uring_submit_and_wait(&m_io_uring, 2);
+ assert(ret != -1);
+
+ struct io_uring_cqe* cqe;
+ uint32_t head;
+ uint32_t count = 0;
+
+ ret = 0;
+ while (count != 2) {
+ io_uring_for_each_cqe(&m_io_uring, head, cqe) {
+ if (cqe->res != 128) {
+ fprintf(stderr, "Got %d, expected 128\n", cqe->res);
+ ret = 1;
+ goto err;
+ }
+ assert(cqe->res == 128);
+ count++;
+ }
+
+ assert(count <= 2);
+ io_uring_cq_advance(&m_io_uring, count);
+ }
+
+err:
+ io_uring_queue_exit(&m_io_uring);
+ return ret;
+}
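Note: the readv above passes offset -1, which io_uring treats like read(2),
reading at the file's current position; for a socket that is the only
sensible offset. Older kernels rejected -1, hence the IORING_FEAT_RW_CUR_POS
gate after queue init. A one-line sketch (sqe and iov_r as prepared in this
test):

    /* -1 means "use the current file position"; needs IORING_FEAT_RW_CUR_POS */
    io_uring_prep_readv(sqe, p_fd[0], iov_r, 1, -1);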
diff --git a/test/socket-rw.c b/test/socket-rw.c
index 1b731b2..4fbf032 100644
--- a/test/socket-rw.c
+++ b/test/socket-rw.c
@@ -17,6 +17,7 @@
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include "liburing.h"
@@ -26,6 +27,7 @@ int main(int argc, char *argv[])
int32_t recv_s0;
int32_t val = 1;
struct sockaddr_in addr;
+ struct iovec iov_r[1], iov_w[1];
if (argc > 1)
return 0;
@@ -40,10 +42,10 @@ int main(int argc, char *argv[])
assert(ret != -1);
addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = 0x0100007fU;
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
do {
- addr.sin_port = (rand() % 61440) + 4096;
+ addr.sin_port = htons((rand() % 61440) + 4096);
ret = bind(recv_s0, (struct sockaddr*)&addr, sizeof(addr));
if (!ret)
break;
@@ -102,27 +104,23 @@ int main(int argc, char *argv[])
char send_buff[128];
{
- struct iovec iov[1];
-
- iov[0].iov_base = recv_buff;
- iov[0].iov_len = sizeof(recv_buff);
+ iov_r[0].iov_base = recv_buff;
+ iov_r[0].iov_len = sizeof(recv_buff);
struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
assert(sqe != NULL);
- io_uring_prep_readv(sqe, p_fd[0], iov, 1, 0);
+ io_uring_prep_readv(sqe, p_fd[0], iov_r, 1, 0);
}
{
- struct iovec iov[1];
-
- iov[0].iov_base = send_buff;
- iov[0].iov_len = sizeof(send_buff);
+ iov_w[0].iov_base = send_buff;
+ iov_w[0].iov_len = sizeof(send_buff);
struct io_uring_sqe* sqe = io_uring_get_sqe(&m_io_uring);
assert(sqe != NULL);
- io_uring_prep_writev(sqe, p_fd[1], iov, 1, 0);
+ io_uring_prep_writev(sqe, p_fd[1], iov_w, 1, 0);
}
ret = io_uring_submit_and_wait(&m_io_uring, 2);
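Note: hoisting iov_r/iov_w to function scope above is a lifetime fix, not a
style one: io_uring_prep_readv() stores only the iovec pointer in the SQE,
and the kernel dereferences it no earlier than io_uring_submit() (and, on
kernels without IORING_FEAT_SUBMIT_STABLE, possibly as late as completion).
The old block-scoped form was a latent use-after-scope:

    {
        struct iovec iov[1] = {{ .iov_base = buf, .iov_len = sizeof(buf) }};

        io_uring_prep_readv(sqe, fd, iov, 1, 0);
    }   /* iov is out of scope here ... */
    io_uring_submit(&ring);  /* ... but only now does the kernel read it */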
diff --git a/test/socket.c b/test/socket.c
new file mode 100644
index 0000000..6a3ea09
--- /dev/null
+++ b/test/socket.c
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Simple test case using the socket op
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <pthread.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+static char str[] = "This is a test of send and recv over io_uring!";
+
+#define MAX_MSG 128
+
+#define PORT 10202
+#define HOST "127.0.0.1"
+
+static int no_socket;
+
+static int recv_prep(struct io_uring *ring, struct iovec *iov, int *sock,
+ int registerfiles)
+{
+ struct sockaddr_in saddr;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, val, use_fd;
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_addr.s_addr = htonl(INADDR_ANY);
+ saddr.sin_port = htons(PORT);
+
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ val = 1;
+ setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+
+ ret = bind(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("bind");
+ goto err;
+ }
+
+ if (registerfiles) {
+ ret = io_uring_register_files(ring, &sockfd, 1);
+ if (ret) {
+ fprintf(stderr, "file reg failed\n");
+ goto err;
+ }
+ use_fd = 0;
+ } else {
+ use_fd = sockfd;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_recv(sqe, use_fd, iov->iov_base, iov->iov_len, 0);
+ if (registerfiles)
+ sqe->flags |= IOSQE_FIXED_FILE;
+ sqe->user_data = 2;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ *sock = sockfd;
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static int do_recv(struct io_uring *ring, struct iovec *iov)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stdout, "wait_cqe: %d\n", ret);
+ goto err;
+ }
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "recv not supported, skipping\n");
+ return 0;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+
+ if (cqe->res -1 != strlen(str)) {
+ fprintf(stderr, "got wrong length: %d/%d\n", cqe->res,
+ (int) strlen(str) + 1);
+ goto err;
+ }
+
+ if (strcmp(str, iov->iov_base)) {
+ fprintf(stderr, "string mismatch\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return 1;
+}
+
+struct recv_data {
+ pthread_mutex_t mutex;
+ int use_sqthread;
+ int registerfiles;
+};
+
+static void *recv_fn(void *data)
+{
+ struct recv_data *rd = data;
+ char buf[MAX_MSG + 1];
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf) - 1,
+ };
+ struct io_uring_params p = { };
+ struct io_uring ring;
+ int ret, sock;
+
+ if (rd->use_sqthread)
+ p.flags = IORING_SETUP_SQPOLL;
+ ret = t_create_ring_params(1, &ring, &p);
+ if (ret == T_SETUP_SKIP) {
+ pthread_mutex_unlock(&rd->mutex);
+ ret = 0;
+ goto err;
+ } else if (ret < 0) {
+ pthread_mutex_unlock(&rd->mutex);
+ goto err;
+ }
+
+ if (rd->use_sqthread && !rd->registerfiles) {
+ if (!(p.features & IORING_FEAT_SQPOLL_NONFIXED)) {
+ fprintf(stdout, "Non-registered SQPOLL not available, skipping\n");
+ pthread_mutex_unlock(&rd->mutex);
+ goto err;
+ }
+ }
+
+ ret = recv_prep(&ring, &iov, &sock, rd->registerfiles);
+ if (ret) {
+ fprintf(stderr, "recv_prep failed: %d\n", ret);
+ goto err;
+ }
+ pthread_mutex_unlock(&rd->mutex);
+ ret = do_recv(&ring, &iov);
+
+ close(sock);
+ io_uring_queue_exit(&ring);
+err:
+ return (void *)(intptr_t)ret;
+}
+
+static int fallback_send(struct io_uring *ring, struct sockaddr_in *saddr)
+{
+ struct iovec iov = {
+ .iov_base = str,
+ .iov_len = sizeof(str),
+ };
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret;
+
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ ret = connect(sockfd, (struct sockaddr *)saddr, sizeof(*saddr));
+ if (ret < 0) {
+ perror("connect");
+ return 1;
+ }
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_send(sqe, sockfd, iov.iov_base, iov.iov_len, 0);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "send not supported, skipping\n");
+ close(sockfd);
+ return 0;
+ }
+ if (cqe->res != iov.iov_len) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+
+ close(sockfd);
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static int do_send(int socket_direct, int alloc)
+{
+ struct sockaddr_in saddr;
+ struct iovec iov = {
+ .iov_base = str,
+ .iov_len = sizeof(str),
+ };
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, fd = -1;
+
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "queue init failed: %d\n", ret);
+ return 1;
+ }
+
+ if (socket_direct) {
+ ret = io_uring_register_files(&ring, &fd, 1);
+ if (ret) {
+ fprintf(stderr, "file register %d\n", ret);
+ return 1;
+ }
+ }
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_port = htons(PORT);
+ inet_pton(AF_INET, HOST, &saddr.sin_addr);
+
+ sqe = io_uring_get_sqe(&ring);
+ if (socket_direct) {
+ unsigned file_index = 0;
+ if (alloc)
+ file_index = IORING_FILE_INDEX_ALLOC - 1;
+ io_uring_prep_socket_direct(sqe, AF_INET, SOCK_DGRAM, 0,
+ file_index, 0);
+ } else {
+ io_uring_prep_socket(sqe, AF_INET, SOCK_DGRAM, 0, 0);
+ }
+ ret = io_uring_submit(&ring);
+ if (ret != 1) {
+ fprintf(stderr, "socket submit: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe: %d\n", ret);
+ return 1;
+ }
+ if (cqe->res < 0) {
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "No socket support, skipping\n");
+ no_socket = 1;
+ io_uring_cqe_seen(&ring, cqe);
+ return fallback_send(&ring, &saddr);
+ }
+
+ fprintf(stderr, "socket res: %d\n", ret);
+ return 1;
+ }
+
+ sockfd = cqe->res;
+ if (socket_direct && !alloc)
+ sockfd = 0;
+ io_uring_cqe_seen(&ring, cqe);
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_connect(sqe, sockfd, (struct sockaddr *) &saddr,
+ sizeof(saddr));
+ if (socket_direct)
+ sqe->flags |= IOSQE_FIXED_FILE;
+ ret = io_uring_submit(&ring);
+ if (ret != 1) {
+ fprintf(stderr, "connect submit: %d\n", ret);
+ return 1;
+ }
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe: %d\n", ret);
+ return 1;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "connect res: %d\n", cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_send(sqe, sockfd, iov.iov_base, iov.iov_len, 0);
+ sqe->user_data = 1;
+ if (socket_direct)
+ sqe->flags |= IOSQE_FIXED_FILE;
+
+ ret = io_uring_submit(&ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "send not supported, skipping\n");
+ close(sockfd);
+ return 0;
+ }
+ if (cqe->res != iov.iov_len) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+
+ close(sockfd);
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static int test(int use_sqthread, int regfiles, int socket_direct, int alloc)
+{
+ pthread_mutexattr_t attr;
+ pthread_t recv_thread;
+ struct recv_data rd;
+ int ret;
+ void *retval;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_setpshared(&attr, 1);
+ pthread_mutex_init(&rd.mutex, &attr);
+ pthread_mutex_lock(&rd.mutex);
+ rd.use_sqthread = use_sqthread;
+ rd.registerfiles = regfiles;
+
+ ret = pthread_create(&recv_thread, NULL, recv_fn, &rd);
+ if (ret) {
+ fprintf(stderr, "Thread create failed: %d\n", ret);
+ pthread_mutex_unlock(&rd.mutex);
+ return 1;
+ }
+
+ pthread_mutex_lock(&rd.mutex);
+ do_send(socket_direct, alloc);
+ pthread_join(recv_thread, &retval);
+ return (intptr_t)retval;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ ret = test(0, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "test sqthread=0 failed\n");
+ return ret;
+ }
+ if (no_socket)
+ return 0;
+
+ ret = test(1, 1, 0, 0);
+ if (ret) {
+ fprintf(stderr, "test sqthread=1 reg=1 failed\n");
+ return ret;
+ }
+
+ ret = test(1, 0, 0, 0);
+ if (ret) {
+ fprintf(stderr, "test sqthread=1 reg=0 failed\n");
+ return ret;
+ }
+
+ ret = test(0, 0, 1, 0);
+ if (ret) {
+ fprintf(stderr, "test sqthread=0 direct=1 failed\n");
+ return ret;
+ }
+
+ ret = test(0, 0, 1, 1);
+ if (ret) {
+ fprintf(stderr, "test sqthread=0 direct=alloc failed\n");
+ return ret;
+ }
+
+ return 0;
+}
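Note: do_send() above drives IORING_OP_SOCKET, including the direct-descriptor
variant. With io_uring_prep_socket_direct() the new socket is installed into
the ring's registered file table instead of the process fd table, so a sparse
table must be registered first and later SQEs address the slot with
IOSQE_FIXED_FILE. Sketch, mirroring the slot-0 case in the test:

    int fd = -1;

    /* reserve one sparse slot in the fixed-file table */
    io_uring_register_files(&ring, &fd, 1);

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_socket_direct(sqe, AF_INET, SOCK_DGRAM, 0,
                                0 /* fixed slot */, 0);
    /* ... submit and reap, then address the socket by slot: */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_connect(sqe, 0 /* slot, not an fd */,
                          (struct sockaddr *)&saddr, sizeof(saddr));
    sqe->flags |= IOSQE_FIXED_FILE;

The IORING_FILE_INDEX_ALLOC - 1 passed in the alloc case accounts for
liburing encoding file_index + 1 into the SQE, so the kernel sees the
"allocate any free slot" sentinel; the allocated slot comes back in cqe->res.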
diff --git a/test/splice.c b/test/splice.c
index f4f0c9c..5e9b789 100644
--- a/test/splice.c
+++ b/test/splice.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
diff --git a/test/sq-poll-dup.c b/test/sq-poll-dup.c
index eeb619c..6a72b82 100644
--- a/test/sq-poll-dup.c
+++ b/test/sq-poll-dup.c
@@ -10,7 +10,6 @@
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/poll.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
@@ -29,9 +28,14 @@ static struct io_uring rings[NR_RINGS];
static int wait_io(struct io_uring *ring, int nr_ios)
{
struct io_uring_cqe *cqe;
+ int ret;
while (nr_ios) {
- io_uring_wait_cqe(ring, &cqe);
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_ret=%d\n", ret);
+ return 1;
+ }
if (cqe->res != BS) {
fprintf(stderr, "Unexpected ret %d\n", cqe->res);
return 1;
@@ -123,7 +127,7 @@ static int test(int fd, int do_dup_and_close, int close_ring)
ring_fd = dup(rings[0].ring_fd);
if (close_ring)
close(rings[0].ring_fd);
- rings[0].ring_fd = ring_fd;
+ rings[0].ring_fd = rings[0].enter_ring_fd = ring_fd;
if (do_dup_and_close)
goto done;
@@ -160,13 +164,16 @@ int main(int argc, char *argv[])
if (argc > 1) {
fname = argv[1];
} else {
- fname = ".basic-rw";
+ fname = ".basic-rw-poll-dup";
t_create_file(fname, FILE_SIZE);
}
vecs = t_create_buffers(BUFFERS, BS);
fd = open(fname, O_RDONLY | O_DIRECT);
+ if (fname != argv[1])
+ unlink(fname);
+
if (fd < 0) {
perror("open");
return -1;
@@ -191,11 +198,7 @@ int main(int argc, char *argv[])
goto err;
}
- if (fname != argv[1])
- unlink(fname);
return 0;
err:
- if (fname != argv[1])
- unlink(fname);
return 1;
}
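Note: the rings[0].ring_fd = rings[0].enter_ring_fd = ring_fd line above
reflects a liburing change: struct io_uring now carries a separate
enter_ring_fd, used for the io_uring_enter(2) syscall (it may be a registered
ring fd). Any code that swaps the descriptor behind liburing's back must keep
both in sync:

    int nfd = dup(ring.ring_fd);

    close(ring.ring_fd);
    ring.ring_fd = ring.enter_ring_fd = nfd;   /* keep the enter path coherent */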
diff --git a/test/sq-poll-kthread.c b/test/sq-poll-kthread.c
index ed7d0bf..3f4a07b 100644
--- a/test/sq-poll-kthread.c
+++ b/test/sq-poll-kthread.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
/*
* Description: test if io_uring SQ poll kthread is stopped when the userspace
* process ended with or without closing the io_uring fd
@@ -12,11 +13,12 @@
#include <stdlib.h>
#include <string.h>
#include <signal.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/wait.h>
#include <sys/epoll.h>
#include "liburing.h"
+#include "helpers.h"
#define SQ_THREAD_IDLE 2000
#define BUF_SIZE 128
@@ -38,23 +40,20 @@ static int do_test_sq_poll_kthread_stopped(bool do_exit)
uint8_t buf[BUF_SIZE];
struct iovec iov;
- if (geteuid()) {
- fprintf(stderr, "sqpoll requires root!\n");
- return TEST_SKIPPED;
- }
-
if (pipe(pipe1) != 0) {
perror("pipe");
return TEST_FAILED;
}
memset(&param, 0, sizeof(param));
-
param.flags |= IORING_SETUP_SQPOLL;
param.sq_thread_idle = SQ_THREAD_IDLE;
- ret = io_uring_queue_init_params(16, &ring, &param);
- if (ret) {
+ ret = t_create_ring_params(16, &ring, &param);
+ if (ret == T_SETUP_SKIP) {
+ ret = TEST_FAILED;
+ goto err_pipe;
+ } else if (ret != T_SETUP_OK) {
fprintf(stderr, "ring setup failed\n");
ret = TEST_FAILED;
goto err_pipe;
diff --git a/test/sq-poll-share.c b/test/sq-poll-share.c
index a46b94f..7bb7626 100644
--- a/test/sq-poll-share.c
+++ b/test/sq-poll-share.c
@@ -9,7 +9,7 @@
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
@@ -82,13 +82,15 @@ int main(int argc, char *argv[])
if (argc > 1) {
fname = argv[1];
} else {
- fname = ".basic-rw";
+ fname = ".basic-rw-poll-share";
t_create_file(fname, FILE_SIZE);
}
vecs = t_create_buffers(BUFFERS, BS);
fd = open(fname, O_RDONLY | O_DIRECT);
+ if (fname != argv[1])
+ unlink(fname);
if (fd < 0) {
perror("open");
return -1;
@@ -129,11 +131,7 @@ int main(int argc, char *argv[])
ios += BUFFERS;
}
- if (fname != argv[1])
- unlink(fname);
return 0;
err:
- if (fname != argv[1])
- unlink(fname);
return 1;
}
diff --git a/test/sqpoll-cancel-hang.c b/test/sqpoll-cancel-hang.c
new file mode 100644
index 0000000..ef62272
--- /dev/null
+++ b/test/sqpoll-cancel-hang.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: MIT */
+#include <fcntl.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include "liburing.h"
+#include "../src/syscall.h"
+
+static uint64_t current_time_ms(void)
+{
+ struct timespec ts;
+ if (clock_gettime(CLOCK_MONOTONIC, &ts))
+ exit(1);
+ return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
+}
+
+#define SIZEOF_IO_URING_SQE 64
+#define SIZEOF_IO_URING_CQE 16
+#define SQ_TAIL_OFFSET 64
+#define SQ_RING_MASK_OFFSET 256
+#define SQ_RING_ENTRIES_OFFSET 264
+#define CQ_RING_ENTRIES_OFFSET 268
+#define CQ_CQES_OFFSET 320
+
+#define IORING_OFF_SQES 0x10000000ULL
+
+static void kill_and_wait(int pid, int* status)
+{
+ kill(-pid, SIGKILL);
+ kill(pid, SIGKILL);
+ while (waitpid(-1, status, __WALL) != pid) {
+ }
+}
+
+#define WAIT_FLAGS __WALL
+
+uint64_t r[3] = {0xffffffffffffffff, 0x0, 0x0};
+
+static long syz_io_uring_setup(volatile long a0, volatile long a1,
+volatile long a2, volatile long a3, volatile long a4, volatile long
+a5)
+{
+ uint32_t entries = (uint32_t)a0;
+ struct io_uring_params* setup_params = (struct io_uring_params*)a1;
+ void* vma1 = (void*)a2;
+ void* vma2 = (void*)a3;
+ void** ring_ptr_out = (void**)a4;
+ void** sqes_ptr_out = (void**)a5;
+ uint32_t fd_io_uring = __sys_io_uring_setup(entries, setup_params);
+ uint32_t sq_ring_sz = setup_params->sq_off.array +
+setup_params->sq_entries * sizeof(uint32_t);
+ uint32_t cq_ring_sz = setup_params->cq_off.cqes +
+setup_params->cq_entries * SIZEOF_IO_URING_CQE;
+ uint32_t ring_sz = sq_ring_sz > cq_ring_sz ? sq_ring_sz : cq_ring_sz;
+ *ring_ptr_out = mmap(vma1, ring_sz, PROT_READ | PROT_WRITE,
+MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring,
+IORING_OFF_SQ_RING);
+ uint32_t sqes_sz = setup_params->sq_entries * SIZEOF_IO_URING_SQE;
+ *sqes_ptr_out = mmap(vma2, sqes_sz, PROT_READ | PROT_WRITE,
+MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd_io_uring, IORING_OFF_SQES);
+ return fd_io_uring;
+}
+
+static long syz_io_uring_submit(volatile long a0, volatile long a1,
+volatile long a2, volatile long a3)
+{
+ char* ring_ptr = (char*)a0;
+ char* sqes_ptr = (char*)a1;
+ char* sqe = (char*)a2;
+ uint32_t sqes_index = (uint32_t)a3;
+ uint32_t sq_ring_entries = *(uint32_t*)(ring_ptr + SQ_RING_ENTRIES_OFFSET);
+ uint32_t cq_ring_entries = *(uint32_t*)(ring_ptr + CQ_RING_ENTRIES_OFFSET);
+ uint32_t sq_array_off = (CQ_CQES_OFFSET + cq_ring_entries *
+SIZEOF_IO_URING_CQE + 63) & ~63;
+ if (sq_ring_entries)
+ sqes_index %= sq_ring_entries;
+ char* sqe_dest = sqes_ptr + sqes_index * SIZEOF_IO_URING_SQE;
+ memcpy(sqe_dest, sqe, SIZEOF_IO_URING_SQE);
+ uint32_t sq_ring_mask = *(uint32_t*)(ring_ptr + SQ_RING_MASK_OFFSET);
+ uint32_t* sq_tail_ptr = (uint32_t*)(ring_ptr + SQ_TAIL_OFFSET);
+ uint32_t sq_tail = *sq_tail_ptr & sq_ring_mask;
+ uint32_t sq_tail_next = *sq_tail_ptr + 1;
+ uint32_t* sq_array = (uint32_t*)(ring_ptr + sq_array_off);
+ *(sq_array + sq_tail) = sqes_index;
+ __atomic_store_n(sq_tail_ptr, sq_tail_next, __ATOMIC_RELEASE);
+ return 0;
+}
+
+
+void trigger_bug(void)
+{
+ intptr_t res = 0;
+ *(uint32_t*)0x20000204 = 0;
+ *(uint32_t*)0x20000208 = 2;
+ *(uint32_t*)0x2000020c = 0;
+ *(uint32_t*)0x20000210 = 0;
+ *(uint32_t*)0x20000218 = -1;
+ memset((void*)0x2000021c, 0, 12);
+ res = -1;
+ res = syz_io_uring_setup(0x7987, 0x20000200, 0x20400000, 0x20ffd000, 0x200000c0, 0x200001c0);
+ if (res != -1) {
+ r[0] = res;
+ r[1] = *(uint64_t*)0x200000c0;
+ r[2] = *(uint64_t*)0x200001c0;
+ }
+ *(uint8_t*)0x20000180 = 0xb;
+ *(uint8_t*)0x20000181 = 1;
+ *(uint16_t*)0x20000182 = 0;
+ *(uint32_t*)0x20000184 = 0;
+ *(uint64_t*)0x20000188 = 4;
+ *(uint64_t*)0x20000190 = 0x20000140;
+ *(uint64_t*)0x20000140 = 0x77359400;
+ *(uint64_t*)0x20000148 = 0;
+ *(uint32_t*)0x20000198 = 1;
+ *(uint32_t*)0x2000019c = 0;
+ *(uint64_t*)0x200001a0 = 0;
+ *(uint16_t*)0x200001a8 = 0;
+ *(uint16_t*)0x200001aa = 0;
+ memset((void*)0x200001ac, 0, 20);
+ syz_io_uring_submit(r[1], r[2], 0x20000180, 1);
+ *(uint32_t*)0x20000544 = 0;
+ *(uint32_t*)0x20000548 = 0x36;
+ *(uint32_t*)0x2000054c = 0;
+ *(uint32_t*)0x20000550 = 0;
+ *(uint32_t*)0x20000558 = r[0];
+ memset((void*)0x2000055c, 0, 12);
+
+}
+int main(void)
+{
+ mmap((void *)0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
+ int pid = fork();
+ if (pid < 0)
+ exit(1);
+ if (pid == 0) {
+ trigger_bug();
+ exit(0);
+ }
+ int status = 0;
+ uint64_t start = current_time_ms();
+ for (;;) {
+ if (current_time_ms() - start < 1000) {
+ continue;
+ }
+ kill_and_wait(pid, &status);
+ break;
+ }
+ return 0;
+}
diff --git a/test/sqpoll-disable-exit.c b/test/sqpoll-disable-exit.c
index 93bcf42..76b6cf5 100644
--- a/test/sqpoll-disable-exit.c
+++ b/test/sqpoll-disable-exit.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
// https://syzkaller.appspot.com/bug?id=99f4ea77bb9b9ef24cefb66469be319f4aa9f162
// autogenerated by syzkaller (https://github.com/google/syzkaller)
diff --git a/test/sqpoll-exit-hang.c b/test/sqpoll-exit-hang.c
index 43385ce..cde2115 100644
--- a/test/sqpoll-exit-hang.c
+++ b/test/sqpoll-exit-hang.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
/*
* Test that we exit properly with SQPOLL and having a request that
* adds a circular reference to the ring itself.
@@ -7,7 +8,7 @@
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
-#include <sys/poll.h>
+#include <poll.h>
#include "liburing.h"
static unsigned long long mtime_since(const struct timeval *s,
diff --git a/test/sqpoll-sleep.c b/test/sqpoll-sleep.c
index 7ffd0e5..9d1cff6 100644
--- a/test/sqpoll-sleep.c
+++ b/test/sqpoll-sleep.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
/*
* Test that the sqthread goes to sleep around the specified time, and that
* the NEED_WAKEUP flag is then set.
diff --git a/test/statx.c b/test/statx.c
index c0f9e9c..5fa086e 100644
--- a/test/statx.c
+++ b/test/statx.c
@@ -11,7 +11,7 @@
#include <fcntl.h>
#include <sys/types.h>
#include <sys/syscall.h>
-#include <linux/stat.h>
+#include <sys/stat.h>
#include "helpers.h"
#include "liburing.h"
diff --git a/test/submit-link-fail.c b/test/submit-link-fail.c
new file mode 100644
index 0000000..45f6976
--- /dev/null
+++ b/test/submit-link-fail.c
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: tests linked requests failing during submission
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <assert.h>
+
+#include "liburing.h"
+
+#define DRAIN_USER_DATA 42
+
+static int test_underprep_fail(bool hardlink, bool drain, bool link_last,
+ int link_size, int fail_idx)
+{
+ const int invalid_fd = 42;
+ int link_flags = IOSQE_IO_LINK;
+ int total_submit = link_size;
+ struct io_uring ring;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ char buffer[1];
+ int i, ret, fds[2];
+
+ if (drain)
+ link_flags |= IOSQE_IO_DRAIN;
+ if (hardlink)
+ link_flags |= IOSQE_IO_HARDLINK;
+
+ assert(fail_idx < link_size);
+ assert(link_size < 40);
+
+ /* create a new ring each time, as the test leaves it dirty */
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ printf("ring setup failed\n");
+ return -1;
+ }
+ if (pipe(fds)) {
+ perror("pipe");
+ return -1;
+ }
+
+ if (drain) {
+ /* clog drain, so following reqs sent to draining */
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_read(sqe, fds[0], buffer, sizeof(buffer), 0);
+ sqe->user_data = DRAIN_USER_DATA;
+ sqe->flags |= IOSQE_IO_DRAIN;
+ total_submit++;
+ }
+
+ for (i = 0; i < link_size; i++) {
+ sqe = io_uring_get_sqe(&ring);
+ if (i == fail_idx) {
+ io_uring_prep_read(sqe, invalid_fd, buffer, 1, 0);
+ sqe->ioprio = (short) -1;
+ } else {
+ io_uring_prep_nop(sqe);
+ }
+
+ if (i != link_size - 1 || !link_last)
+ sqe->flags |= link_flags;
+ sqe->user_data = i;
+ }
+
+ ret = io_uring_submit(&ring);
+ if (ret != total_submit) {
+ /* Old behaviour, failed early and under-submitted */
+ if (ret == fail_idx + 1 + drain)
+ goto out;
+ fprintf(stderr, "submit failed: %d\n", ret);
+ return -1;
+ }
+
+ if (drain) {
+ /* unclog drain */
+ ret = write(fds[1], buffer, sizeof(buffer));
+ if (ret < 0) {
+ perror("write");
+ return 1;
+ }
+ }
+
+ for (i = 0; i < total_submit; i++) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe=%d\n", ret);
+ return 1;
+ }
+
+ ret = cqe->res;
+ if (cqe->user_data == DRAIN_USER_DATA) {
+ if (ret != 1) {
+ fprintf(stderr, "drain failed %d\n", ret);
+ return 1;
+ }
+ } else if (cqe->user_data == fail_idx) {
+ if (ret == 0 || ret == -ECANCELED) {
+ fprintf(stderr, "half-prep req unexpected return %d\n", ret);
+ return 1;
+ }
+ } else {
+ if (ret != -ECANCELED) {
+ fprintf(stderr, "cancel failed %d, ud %d\n", ret, (int)cqe->user_data);
+ return 1;
+ }
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ }
+out:
+ close(fds[0]);
+ close(fds[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret, link_size, fail_idx, i;
+
+ if (argc > 1)
+ return 0;
+
+ /*
+ * hardlink, size=3, fail_idx=1, drain=false -- kernel fault
+ * link, size=3, fail_idx=0, drain=true -- kernel fault
+ * link, size=3, fail_idx=1, drain=true -- invalid cqe->res
+ */
+ for (link_size = 0; link_size < 3; link_size++) {
+ for (fail_idx = 0; fail_idx < link_size; fail_idx++) {
+ for (i = 0; i < 8; i++) {
+ bool hardlink = (i & 1) != 0;
+ bool drain = (i & 2) != 0;
+ bool link_last = (i & 4) != 0;
+
+ ret = test_underprep_fail(hardlink, drain, link_last,
+ link_size, fail_idx);
+ if (!ret)
+ continue;
+
+ fprintf(stderr, "failed %d, hard %d, drain %d,"
+ "link_last %d, size %d, idx %d\n",
+ ret, hardlink, drain, link_last,
+ link_size, fail_idx);
+ return 1;
+ }
+ }
+ }
+
+ return 0;
+}
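Note: the drain trick above is worth calling out: an IOSQE_IO_DRAIN request
does not start until everything before it completes, and nothing after it
starts until it completes. Reading from an empty pipe with the drain flag set
therefore parks all subsequently submitted requests, which is exactly the
state the test needs the links to fail in. Sketch (pipe fds as in the test):

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_read(sqe, fds[0], buffer, sizeof(buffer), 0);
    sqe->flags |= IOSQE_IO_DRAIN;   /* barrier: later reqs queue behind it */
    /* ... submit more SQEs; none of them run yet ... */
    write(fds[1], buffer, sizeof(buffer));   /* completes the read, unclogs */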
diff --git a/test/submit-reuse.c b/test/submit-reuse.c
index 74ba769..d5ccdd4 100644
--- a/test/submit-reuse.c
+++ b/test/submit-reuse.c
@@ -26,13 +26,11 @@ struct thread_data {
static void *flusher(void *__data)
{
struct thread_data *data = __data;
- int i = 0;
while (!data->do_exit) {
posix_fadvise(data->fd1, 0, FILE_SIZE, POSIX_FADV_DONTNEED);
posix_fadvise(data->fd2, 0, FILE_SIZE, POSIX_FADV_DONTNEED);
usleep(10);
- i++;
}
return NULL;
@@ -140,11 +138,6 @@ static int test_reuse(int argc, char *argv[], int split, int async)
int do_unlink = 1;
void *tret;
- if (argc > 1) {
- fname1 = argv[1];
- do_unlink = 0;
- }
-
ret = io_uring_queue_init_params(32, &ring, &p);
if (ret) {
fprintf(stderr, "io_uring_queue_init: %d\n", ret);
@@ -153,21 +146,29 @@ static int test_reuse(int argc, char *argv[], int split, int async)
if (!(p.features & IORING_FEAT_SUBMIT_STABLE)) {
fprintf(stdout, "FEAT_SUBMIT_STABLE not there, skipping\n");
+ io_uring_queue_exit(&ring);
no_stable = 1;
return 0;
}
- if (do_unlink)
+ if (argc > 1) {
+ fname1 = argv[1];
+ do_unlink = 0;
+ } else {
t_create_file(fname1, FILE_SIZE);
-
- t_create_file(".reuse.2", FILE_SIZE);
+ }
fd1 = open(fname1, O_RDONLY);
+ if (do_unlink)
+ unlink(fname1);
if (fd1 < 0) {
perror("open fname1");
goto err;
}
+
+ t_create_file(".reuse.2", FILE_SIZE);
fd2 = open(".reuse.2", O_RDONLY);
+ unlink(".reuse.2");
if (fd2 < 0) {
perror("open .reuse.2");
goto err;
@@ -206,15 +207,9 @@ static int test_reuse(int argc, char *argv[], int split, int async)
close(fd2);
close(fd1);
io_uring_queue_exit(&ring);
- if (do_unlink)
- unlink(fname1);
- unlink(".reuse.2");
return 0;
err:
io_uring_queue_exit(&ring);
- if (do_unlink)
- unlink(fname1);
- unlink(".reuse.2");
return 1;
}
diff --git a/test/symlink.c b/test/symlink.c
index 8b5e04a..cf4aa96 100644
--- a/test/symlink.c
+++ b/test/symlink.c
@@ -67,6 +67,9 @@ int main(int argc, char *argv[])
int ret;
struct io_uring ring;
+ if (argc > 1)
+ return 0;
+
ret = io_uring_queue_init(8, &ring, 0);
if (ret) {
fprintf(stderr, "queue init failed: %d\n", ret);
diff --git a/test/test.h b/test/test.h
new file mode 100644
index 0000000..3628163
--- /dev/null
+++ b/test/test.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: Test configs for tests.
+ */
+#ifndef LIBURING_TEST_H
+#define LIBURING_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct io_uring_test_config {
+ unsigned int flags;
+ const char *description;
+} io_uring_test_config;
+
+io_uring_test_config io_uring_test_configs[] = {
+ { 0, "default" },
+ { IORING_SETUP_SQE128, "large SQE"},
+ { IORING_SETUP_CQE32, "large CQE"},
+ { IORING_SETUP_SQE128 | IORING_SETUP_CQE32, "large SQE/CQE" },
+};
+
+#define FOR_ALL_TEST_CONFIGS \
+ for (int i = 0; i < sizeof(io_uring_test_configs) / sizeof(io_uring_test_configs[0]); i++)
+
+#define IORING_GET_TEST_CONFIG_FLAGS() (io_uring_test_configs[i].flags)
+#define IORING_GET_TEST_CONFIG_DESCRIPTION() (io_uring_test_configs[i].description)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
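Note: test.h defines data in a header, so it is meant to be included by
exactly one translation unit per test binary. A hypothetical consumer (names
from the header; the test body is a placeholder):

    #include <stdio.h>
    #include "liburing.h"
    #include "test.h"

    int main(void)
    {
        FOR_ALL_TEST_CONFIGS {
            struct io_uring ring;
            int ret;

            ret = io_uring_queue_init(8, &ring,
                                      IORING_GET_TEST_CONFIG_FLAGS());
            if (ret) {
                /* e.g. kernel without SQE128/CQE32: skip this config */
                fprintf(stderr, "skip config: %s\n",
                        IORING_GET_TEST_CONFIG_DESCRIPTION());
                continue;
            }
            /* ... run the actual test against this ring ... */
            io_uring_queue_exit(&ring);
        }
        return 0;
    }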
diff --git a/test/thread-exit.c b/test/thread-exit.c
index c2f2148..3e20431 100644
--- a/test/thread-exit.c
+++ b/test/thread-exit.c
@@ -11,7 +11,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
-#include <sys/poll.h>
+#include <poll.h>
#include <pthread.h>
#include "helpers.h"
@@ -26,8 +26,18 @@ struct d {
unsigned long off;
int pipe_fd;
int err;
+ int i;
};
+static char *g_buf[NR_IOS] = {NULL};
+
+static void free_g_buf(void)
+{
+ int i;
+ for (i = 0; i < NR_IOS; i++)
+ free(g_buf[i]);
+}
+
static void *do_io(void *data)
{
struct d *d = data;
@@ -36,6 +46,7 @@ static void *do_io(void *data)
int ret;
buffer = t_malloc(WSIZE);
+ g_buf[d->i] = buffer;
memset(buffer, 0x5a, WSIZE);
sqe = io_uring_get_sqe(d->ring);
if (!sqe) {
@@ -55,8 +66,6 @@ static void *do_io(void *data)
ret = io_uring_submit(d->ring);
if (ret != 2)
d->err++;
-
- free(buffer);
return NULL;
}
@@ -86,12 +95,12 @@ int main(int argc, char *argv[])
} else {
fname = ".thread.exit";
do_unlink = 1;
- }
-
- if (do_unlink)
t_create_file(fname, 4096);
+ }
fd = open(fname, O_WRONLY);
+ if (do_unlink)
+ unlink(fname);
if (fd < 0) {
perror("open");
return 1;
@@ -103,6 +112,7 @@ int main(int argc, char *argv[])
d.pipe_fd = fds[0];
d.err = 0;
for (i = 0; i < NR_IOS; i++) {
+ d.i = i;
memset(&thread, 0, sizeof(thread));
pthread_create(&thread, NULL, do_io, &d);
pthread_join(thread, NULL);
@@ -125,11 +135,9 @@ int main(int argc, char *argv[])
io_uring_cqe_seen(&ring, cqe);
}
- if (do_unlink)
- unlink(fname);
+ free_g_buf();
return d.err;
err:
- if (do_unlink)
- unlink(fname);
+ free_g_buf();
return 1;
}
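
Editor's note: the thread-exit.c change stops freeing the write buffer inside the worker thread, because the kernel may still read from it after the thread has exited; buffers now live in g_buf until all CQEs are reaped. A minimal sketch of the lifetime rule being enforced (the helper name is hypothetical):

#include <stdlib.h>
#include <string.h>
#include "liburing.h"

/* Memory referenced by a submitted SQE must stay valid until the
 * matching CQE is reaped, even if the submitting thread is gone. */
static int write_then_free(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *buf = malloc(4096);

	if (!buf)
		return 1;
	memset(buf, 0x5a, 4096);
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd, buf, 4096, 0);
	io_uring_submit(ring);
	/* free(buf) here would hand the kernel a dangling buffer */
	if (io_uring_wait_cqe(ring, &cqe))
		return 1;
	io_uring_cqe_seen(ring, cqe);
	free(buf);	/* safe: the completion has been consumed */
	return 0;
}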
diff --git a/test/timeout-new.c b/test/timeout-new.c
index b0bb5ee..6efcfb4 100644
--- a/test/timeout-new.c
+++ b/test/timeout-new.c
@@ -53,14 +53,12 @@ static int test_return_before_timeout(struct io_uring *ring)
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int ret;
+ bool retried = false;
struct __kernel_timespec ts;
- sqe = io_uring_get_sqe(ring);
- if (!sqe) {
- fprintf(stderr, "%s: get sqe failed\n", __FUNCTION__);
- return 1;
- }
+ msec_to_ts(&ts, TIMEOUT_MSEC);
+ sqe = io_uring_get_sqe(ring);
io_uring_prep_nop(sqe);
ret = io_uring_submit(ring);
@@ -69,13 +67,21 @@ static int test_return_before_timeout(struct io_uring *ring)
return 1;
}
- msec_to_ts(&ts, TIMEOUT_MSEC);
+again:
ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
- if (ret < 0) {
+ if (ret == -ETIME && (ring->flags & IORING_SETUP_SQPOLL) && !retried) {
+ /*
+	 * there is a small chance SQPOLL hasn't been woken up yet,
+ * give it one more try.
+ */
+ printf("warning: funky SQPOLL timing\n");
+ sleep(1);
+ retried = true;
+ goto again;
+ } else if (ret < 0) {
fprintf(stderr, "%s: timeout error: %d\n", __FUNCTION__, ret);
return 1;
}
-
io_uring_cqe_seen(ring, cqe);
return 0;
}
@@ -202,7 +208,7 @@ int main(int argc, char *argv[])
return 1;
}
if (!(ring_normal.features & IORING_FEAT_EXT_ARG)) {
- fprintf(stderr, "feature IORING_FEAT_EXT_ARG not supported.\n");
+ fprintf(stderr, "feature IORING_FEAT_EXT_ARG not supported, skipping.\n");
return 0;
}
diff --git a/test/timeout-overflow.c b/test/timeout-overflow.c
index f952f80..671f171 100644
--- a/test/timeout-overflow.c
+++ b/test/timeout-overflow.c
@@ -101,7 +101,7 @@ static int test_timeout_overflow(void)
msec_to_ts(&ts, TIMEOUT_MSEC);
for (i = 0; i < 4; i++) {
- unsigned num;
+ unsigned num = 0;
sqe = io_uring_get_sqe(&ring);
switch (i) {
case 0:
diff --git a/test/timeout.c b/test/timeout.c
index a28d599..2fd4736 100644
--- a/test/timeout.c
+++ b/test/timeout.c
@@ -10,6 +10,9 @@
#include <string.h>
#include <fcntl.h>
#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
#include "liburing.h"
#include "../src/syscall.h"
@@ -180,7 +183,8 @@ err:
return 1;
}
-static int test_single_timeout_wait(struct io_uring *ring)
+static int test_single_timeout_wait(struct io_uring *ring,
+ struct io_uring_params *p)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
@@ -195,6 +199,15 @@ static int test_single_timeout_wait(struct io_uring *ring)
io_uring_prep_nop(sqe);
io_uring_sqe_set_data(sqe, (void *) 1);
+ /* no implied submit for newer kernels */
+ if (p->features & IORING_FEAT_EXT_ARG) {
+ ret = io_uring_submit(ring);
+ if (ret != 2) {
+ fprintf(stderr, "%s: submit %d\n", __FUNCTION__, ret);
+ return 1;
+ }
+ }
+
msec_to_ts(&ts, 1000);
i = 0;
@@ -550,8 +563,8 @@ static int test_multi_timeout(struct io_uring *ring)
gettimeofday(&tv, NULL);
for (i = 0; i < 2; i++) {
- unsigned int time;
- __u64 user_data;
+ unsigned int time = 0;
+ __u64 user_data = 0;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
@@ -1161,16 +1174,171 @@ err:
return 1;
}
+static int fill_exec_target(char *dst, char *path)
+{
+ struct stat sb;
+
+ /*
+ * Should either be ./exec-target.t or test/exec-target.t
+ */
+ sprintf(dst, "%s", path);
+ return stat(dst, &sb);
+}
+
+static int test_timeout_link_cancel(void)
+{
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ char prog_path[PATH_MAX];
+ pid_t p;
+ int ret, i, wstatus;
+
+ if (fill_exec_target(prog_path, "./exec-target.t") &&
+ fill_exec_target(prog_path, "test/exec-target.t")) {
+ fprintf(stdout, "Can't find exec-target, skipping\n");
+ return 0;
+ }
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return 1;
+ }
+
+ p = fork();
+ if (p == -1) {
+ fprintf(stderr, "fork() failed\n");
+ return 1;
+ }
+
+ if (p == 0) {
+ struct io_uring_sqe *sqe;
+ struct __kernel_timespec ts;
+
+ msec_to_ts(&ts, 10000);
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_timeout(sqe, &ts, 0, 0);
+ sqe->flags |= IOSQE_IO_LINK;
+ sqe->user_data = 0;
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_nop(sqe);
+ sqe->user_data = 1;
+
+ ret = io_uring_submit(&ring);
+ if (ret != 2) {
+ fprintf(stderr, "%s: got %d, wanted 1\n", __FUNCTION__, ret);
+ exit(1);
+ }
+
+ /* trigger full cancellation */
+ ret = execl(prog_path, prog_path, NULL);
+ if (ret) {
+ fprintf(stderr, "exec failed %i\n", errno);
+ exit(1);
+ }
+ exit(0);
+ }
+
+ if (waitpid(p, &wstatus, 0) == (pid_t)-1) {
+ perror("waitpid()");
+ return 1;
+ }
+ if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus)) {
+ fprintf(stderr, "child failed %i\n", WEXITSTATUS(wstatus));
+ return 1;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe=%d\n", ret);
+ return 1;
+ }
+ if (cqe->res != -ECANCELED) {
+ fprintf(stderr, "invalid result, user_data: %i res: %i\n",
+ (int)cqe->user_data, cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ }
+
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+
+static int test_not_failing_links(void)
+{
+ struct io_uring ring;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct __kernel_timespec ts;
+ int ret;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return 1;
+ }
+
+ msec_to_ts(&ts, 1);
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_ETIME_SUCCESS);
+ sqe->user_data = 1;
+ sqe->flags |= IOSQE_IO_LINK;
+
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_nop(sqe);
+ sqe->user_data = 2;
+
+ ret = io_uring_submit(&ring);
+ if (ret != 2) {
+ fprintf(stderr, "%s: sqe submit failed: %d\n", __FUNCTION__, ret);
+ return 1;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "%s: wait completion %d\n", __FUNCTION__, ret);
+ return 1;
+ } else if (cqe->user_data == 1 && cqe->res == -EINVAL) {
+ fprintf(stderr, "ETIME_SUCCESS is not supported, skip\n");
+ goto done;
+ } else if (cqe->res != -ETIME || cqe->user_data != 1) {
+ fprintf(stderr, "timeout failed %i %i\n", cqe->res,
+ (int)cqe->user_data);
+ return 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "%s: wait completion %d\n", __FUNCTION__, ret);
+ return 1;
+ } else if (cqe->res || cqe->user_data != 2) {
+ fprintf(stderr, "nop failed %i %i\n", cqe->res,
+ (int)cqe->user_data);
+ return 1;
+ }
+done:
+ io_uring_cqe_seen(&ring, cqe);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+
int main(int argc, char *argv[])
{
struct io_uring ring, sqpoll_ring;
bool has_timeout_update, sqpoll;
+ struct io_uring_params p = { };
int ret;
if (argc > 1)
return 0;
- ret = io_uring_queue_init(8, &ring, 0);
+ ret = io_uring_queue_init_params(8, &ring, &p);
if (ret) {
fprintf(stderr, "ring setup failed\n");
return 1;
@@ -1252,7 +1420,7 @@ int main(int argc, char *argv[])
return ret;
}
- ret = test_single_timeout_wait(&ring);
+ ret = test_single_timeout_wait(&ring, &p);
if (ret) {
fprintf(stderr, "test_single_timeout_wait failed\n");
return ret;
@@ -1337,6 +1505,18 @@ int main(int argc, char *argv[])
return ret;
}
+ ret = test_timeout_link_cancel();
+ if (ret) {
+ fprintf(stderr, "test_timeout_link_cancel failed\n");
+ return ret;
+ }
+
+ ret = test_not_failing_links();
+ if (ret) {
+ fprintf(stderr, "test_not_failing_links failed\n");
+ return ret;
+ }
+
if (sqpoll)
io_uring_queue_exit(&sqpoll_ring);
return 0;
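
Editor's note: the timeout.c hunks above key off IORING_FEAT_EXT_ARG, since on kernels advertising it a timed CQE wait no longer submits pending SQEs as a side effect, so the test must call io_uring_submit() itself. A minimal sketch of probing the feature at ring setup:

#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring_params p = { };
	struct io_uring ring;

	if (io_uring_queue_init_params(8, &ring, &p))
		return 1;
	if (p.features & IORING_FEAT_EXT_ARG)
		printf("EXT_ARG: timed waits do not imply submit\n");
	else
		printf("legacy: timed waits submit pending SQEs first\n");
	io_uring_queue_exit(&ring);
	return 0;
}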
diff --git a/test/tty-write-dpoll.c b/test/tty-write-dpoll.c
new file mode 100644
index 0000000..ea9df1d
--- /dev/null
+++ b/test/tty-write-dpoll.c
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Test double poll tty write. A test case for the regression fixed by:
+ *
+ * commit 6e295a664efd083ac9a5c1a8130c45be1db0cde7
+ * Author: Jens Axboe <axboe@kernel.dk>
+ * Date: Tue Mar 22 13:11:28 2022 -0600
+ *
+ * io_uring: fix assuming triggered poll waitqueue is the single poll
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+#define SQES 128
+#define BUFSIZE 512
+
+int main(int argc, char *argv[])
+{
+ static char buf[BUFSIZE];
+ struct iovec vecs[SQES];
+ struct io_uring ring;
+ int ret, i, fd;
+
+ if (argc > 1)
+ return 0;
+
+ fd = open("/dev/ttyS0", O_RDWR | O_NONBLOCK);
+ if (fd < 0)
+ return 0;
+
+ ret = t_create_ring(SQES, &ring, 0);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ else if (ret < 0)
+ return 1;
+
+ for (i = 0; i < SQES; i++) {
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(&ring);
+ vecs[i].iov_base = buf;
+ vecs[i].iov_len = sizeof(buf);
+ io_uring_prep_writev(sqe, fd, &vecs[i], 1, 0);
+ }
+
+ ret = io_uring_submit(&ring);
+ if (ret != SQES) {
+ fprintf(stderr, "submit: %d\n", ret);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/test/unlink.c b/test/unlink.c
index f8c7639..8e7d2f4 100644
--- a/test/unlink.c
+++ b/test/unlink.c
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include <sys/stat.h>
#include "liburing.h"
@@ -23,7 +24,7 @@ static int test_unlink(struct io_uring *ring, const char *old)
fprintf(stderr, "get sqe failed\n");
goto err;
}
- io_uring_prep_unlinkat(sqe, AT_FDCWD, old, 0);
+ io_uring_prep_unlink(sqe, old, 0);
ret = io_uring_submit(ring);
if (ret <= 0) {
diff --git a/test/xattr.c b/test/xattr.c
new file mode 100644
index 0000000..d88059c
--- /dev/null
+++ b/test/xattr.c
@@ -0,0 +1,425 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "helpers.h"
+#include "liburing.h"
+
+static int no_xattr;
+
+/* Define constants. */
+#define XATTR_SIZE 255
+#define QUEUE_DEPTH 32
+
+#define FILENAME "xattr.test"
+#define KEY1 "user.val1"
+#define KEY2 "user.val2"
+#define VALUE1 "value1"
+#define VALUE2 "value2-a-lot-longer"
+
+
+/* Call fsetxattr. */
+static int io_uring_fsetxattr(struct io_uring *ring, int fd, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Error cannot get sqe\n");
+ return -1;
+ }
+
+ io_uring_prep_fsetxattr(sqe, fd, name, value, flags, size);
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "Error io_uring_submit_and_wait: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "Error io_uring_wait_cqe: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = cqe->res;
+ if (ret == -EINVAL)
+ no_xattr = 1;
+ io_uring_cqe_seen(ring, cqe);
+
+ return ret;
+}
+
+/* Submit fgetxattr request. */
+static int io_uring_fgetxattr(struct io_uring *ring, int fd, const char *name,
+ void *value, size_t size)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Error cannot get sqe\n");
+ return -1;
+ }
+
+ io_uring_prep_fgetxattr(sqe, fd, name, value, size);
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "Error io_uring_submit_and_wait: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "Error io_uring_wait_cqe: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = cqe->res;
+ if (ret == -1) {
+ fprintf(stderr, "Error couldn'tget value\n");
+ return -1;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return ret;
+}
+
+/* Call setxattr. */
+static int io_uring_setxattr(struct io_uring *ring, const char *path,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Error cannot get sqe\n");
+ return -1;
+ }
+
+ io_uring_prep_setxattr(sqe, name, value, path, flags, size);
+
+ ret = io_uring_submit_and_wait(ring, 1);
+ if (ret != 1) {
+ fprintf(stderr, "Error io_uring_submit_and_wait: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "Error io_uring_wait_cqe: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+
+ return ret;
+}
+
+/* Submit getxattr request. */
+static int io_uring_getxattr(struct io_uring *ring, const char *path,
+ const char *name, void *value, size_t size)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "Error cannot get sqe\n");
+ return -1;
+ }
+
+ io_uring_prep_getxattr(sqe, name, value, path, size);
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "Error io_uring_submit_and_wait: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "Error io_uring_wait_cqe: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = cqe->res;
+ if (ret == -1) {
+ fprintf(stderr, "Error couldn'tget value\n");
+ return -1;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ return ret;
+}
+
+/* Test driver for fsetxattr and fgetxattr. */
+static int test_fxattr(void)
+{
+ int rc = 0;
+ size_t value_len;
+ struct io_uring ring;
+ char value[XATTR_SIZE];
+
+ /* Init io-uring queue. */
+ int ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "child: ring setup failed: %d\n", ret);
+ return -1;
+ }
+
+ /* Create the test file. */
+ int fd = open(FILENAME, O_CREAT | O_RDWR, 0644);
+ if (fd < 0) {
+ fprintf(stderr, "Error: cannot open file: ret=%d\n", fd);
+ return -1;
+ }
+
+ /* Test writing attributes. */
+ if (io_uring_fsetxattr(&ring, fd, KEY1, VALUE1, strlen(VALUE1), 0) < 0) {
+ if (no_xattr) {
+ fprintf(stdout, "No xattr support, skipping\n");
+ goto Exit;
+ }
+ fprintf(stderr, "Error fsetxattr cannot write key1\n");
+ rc = -1;
+ goto Exit;
+ }
+
+ if (io_uring_fsetxattr(&ring, fd, KEY2, VALUE2, strlen(VALUE2), 0) < 0) {
+ fprintf(stderr, "Error fsetxattr cannot write key1\n");
+ rc = -1;
+ goto Exit;
+ }
+
+ /* Test reading attributes. */
+ value_len = io_uring_fgetxattr(&ring, fd, KEY1, value, XATTR_SIZE);
+	if (value_len != strlen(VALUE1) || strncmp(value, VALUE1, value_len)) {
+ fprintf(stderr, "Error: fgetxattr expected value: %s, returned value: %s\n", VALUE1, value);
+ rc = -1;
+ goto Exit;
+ }
+
+ value_len = io_uring_fgetxattr(&ring, fd, KEY2, value, XATTR_SIZE);
+	if (value_len != strlen(VALUE2) || strncmp(value, VALUE2, value_len)) {
+ fprintf(stderr, "Error: fgetxattr expected value: %s, returned value: %s\n", VALUE2, value);
+ rc = -1;
+ goto Exit;
+ }
+
+ /* Cleanup. */
+Exit:
+ close(fd);
+ unlink(FILENAME);
+
+ io_uring_queue_exit(&ring);
+
+ return rc;
+}
+
+/* Test driver for setxattr and getxattr. */
+static int test_xattr(void)
+{
+ int rc = 0;
+ int value_len;
+ struct io_uring ring;
+ char value[XATTR_SIZE];
+
+ /* Init io-uring queue. */
+ int ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "child: ring setup failed: %d\n", ret);
+ return -1;
+ }
+
+ /* Create the test file. */
+ t_create_file(FILENAME, 0);
+
+ /* Test writing attributes. */
+ if (io_uring_setxattr(&ring, FILENAME, KEY1, VALUE1, strlen(VALUE1), 0) < 0) {
+ fprintf(stderr, "Error setxattr cannot write key1\n");
+ rc = -1;
+ goto Exit;
+ }
+
+ if (io_uring_setxattr(&ring, FILENAME, KEY2, VALUE2, strlen(VALUE2), 0) < 0) {
+ fprintf(stderr, "Error setxattr cannot write key1\n");
+ rc = -1;
+ goto Exit;
+ }
+
+ /* Test reading attributes. */
+ value_len = io_uring_getxattr(&ring, FILENAME, KEY1, value, XATTR_SIZE);
+ if (value_len != strlen(VALUE1) || strncmp(value, VALUE1, value_len)) {
+ fprintf(stderr, "Error: getxattr expected value: %s, returned value: %s\n", VALUE1, value);
+ rc = -1;
+ goto Exit;
+ }
+
+ value_len = io_uring_getxattr(&ring, FILENAME, KEY2, value, XATTR_SIZE);
+ if (value_len != strlen(VALUE2) || strncmp(value, VALUE2, value_len)) {
+ fprintf(stderr, "Error: getxattr expected value: %s, returned value: %s\n", VALUE2, value);
+ rc = -1;
+ goto Exit;
+ }
+
+ /* Cleanup. */
+Exit:
+ io_uring_queue_exit(&ring);
+ unlink(FILENAME);
+
+ return rc;
+}
+
+/* Test driver for failure cases of fsetxattr and fgetxattr. */
+static int test_failure_fxattr(void)
+{
+ int rc = 0;
+ struct io_uring ring;
+ char value[XATTR_SIZE];
+
+ /* Init io-uring queue. */
+ int ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "child: ring setup failed: %d\n", ret);
+ return -1;
+ }
+
+ /* Create the test file. */
+ int fd = open(FILENAME, O_CREAT | O_RDWR, 0644);
+ if (fd < 0) {
+ fprintf(stderr, "Error: cannot open file: ret=%d\n", fd);
+ return -1;
+ }
+
+ /* Test writing attributes. */
+ assert(io_uring_fsetxattr(&ring, -1, KEY1, VALUE1, strlen(VALUE1), 0) < 0);
+ assert(io_uring_fsetxattr(&ring, fd, NULL, VALUE1, strlen(VALUE1), 0) < 0);
+ assert(io_uring_fsetxattr(&ring, fd, KEY1, NULL, strlen(VALUE1), 0) < 0);
+ assert(io_uring_fsetxattr(&ring, fd, KEY1, VALUE1, 0, 0) == 0);
+ assert(io_uring_fsetxattr(&ring, fd, KEY1, VALUE1, -1, 0) < 0);
+
+ /* Test reading attributes. */
+ assert(io_uring_fgetxattr(&ring, -1, KEY1, value, XATTR_SIZE) < 0);
+ assert(io_uring_fgetxattr(&ring, fd, NULL, value, XATTR_SIZE) < 0);
+ assert(io_uring_fgetxattr(&ring, fd, KEY1, value, 0) == 0);
+
+ /* Cleanup. */
+ close(fd);
+ unlink(FILENAME);
+
+ io_uring_queue_exit(&ring);
+
+ return rc;
+}
+
+
+/* Test driver for failure cases for setxattr and getxattr. */
+static int test_failure_xattr(void)
+{
+ int rc = 0;
+ struct io_uring ring;
+ char value[XATTR_SIZE];
+
+ /* Init io-uring queue. */
+ int ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "child: ring setup failed: %d\n", ret);
+ return -1;
+ }
+
+ /* Create the test file. */
+ t_create_file(FILENAME, 0);
+
+ /* Test writing attributes. */
+ assert(io_uring_setxattr(&ring, "complete garbage", KEY1, VALUE1, strlen(VALUE1), 0) < 0);
+ assert(io_uring_setxattr(&ring, NULL, KEY1, VALUE1, strlen(VALUE1), 0) < 0);
+ assert(io_uring_setxattr(&ring, FILENAME, NULL, VALUE1, strlen(VALUE1), 0) < 0);
+ assert(io_uring_setxattr(&ring, FILENAME, KEY1, NULL, strlen(VALUE1), 0) < 0);
+ assert(io_uring_setxattr(&ring, FILENAME, KEY1, VALUE1, 0, 0) == 0);
+
+ /* Test reading attributes. */
+ assert(io_uring_getxattr(&ring, "complete garbage", KEY1, value, XATTR_SIZE) < 0);
+ assert(io_uring_getxattr(&ring, NULL, KEY1, value, XATTR_SIZE) < 0);
+ assert(io_uring_getxattr(&ring, FILENAME, NULL, value, XATTR_SIZE) < 0);
+ assert(io_uring_getxattr(&ring, FILENAME, KEY1, NULL, XATTR_SIZE) == 0);
+ assert(io_uring_getxattr(&ring, FILENAME, KEY1, value, 0) == 0);
+
+ /* Cleanup. */
+ io_uring_queue_exit(&ring);
+ unlink(FILENAME);
+
+ return rc;
+}
+
+/* Test for an invalid SQE; this will cause a segmentation fault if enabled. */
+static int test_invalid_sqe(void)
+{
+#ifdef DESTRUCTIVE_TEST
+ struct io_uring_sqe *sqe = NULL;
+ struct io_uring_cqe *cqe = NULL;
+ struct io_uring ring;
+
+ /* Init io-uring queue. */
+ int ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "child: ring setup failed: %d\n", ret);
+ return -1;
+ }
+
+ /* Pass invalid SQE. */
+ io_uring_prep_setxattr(sqe, FILENAME, KEY1, VALUE1, strlen(VALUE1), 0);
+
+ ret = io_uring_submit(&ring);
+ if (ret != 1) {
+ fprintf(stderr, "Error io_uring_submit_and_wait: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "Error io_uring_wait_cqe: ret=%d\n", ret);
+ return -1;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(&ring, cqe);
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+/* Test driver. */
+int main(int argc, char *argv[])
+{
+ if (argc > 1)
+ return 0;
+
+ if (test_fxattr())
+ return EXIT_FAILURE;
+ if (no_xattr)
+ return EXIT_SUCCESS;
+ if (test_xattr() || test_failure_fxattr() || test_failure_xattr() ||
+ test_invalid_sqe())
+ return EXIT_FAILURE;
+
+ return EXIT_SUCCESS;
+}
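
Editor's note: the xattr tests above verify reads only through the io_uring path. A natural cross-check, not part of this patch, is reading the attribute back through the plain getxattr(2) syscall; a minimal sketch reusing the tests' key/value conventions (the helper name is hypothetical):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

/* Read an attribute via the synchronous syscall and compare it against
 * the value the io_uring path claims to have stored. */
static int verify_xattr(const char *path, const char *key,
			const char *expected)
{
	char buf[255];
	ssize_t len = getxattr(path, key, buf, sizeof(buf));

	if (len < 0 || (size_t)len != strlen(expected) ||
	    strncmp(buf, expected, len)) {
		fprintf(stderr, "xattr mismatch for %s\n", key);
		return 1;
	}
	return 0;
}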