diff options
author | Joseph Huber <huberjn@outlook.com> | 2024-05-10 16:38:13 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-10 16:38:13 -0500 |
commit | fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c (patch) | |
tree | a7f1cdac64b12f731d6a6f5d54815a99e3f35ecd | |
parent | 5d18d575d8d304e4336483d8be7394d0f4694cb1 (diff) | |
download | llvm-fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c.tar.gz |
[libc] Add memory fence utility to the GPU utilities (#91756)
Summary:
GPUs like to execute instructions in the background until something
explicitly consumes them. We are working on adding some
microbenchmarking code, which requires flushing the pending memory
operations beforehand. This patch simply adds these utility functions
that will be used in the near future.
-rw-r--r-- | libc/src/__support/GPU/amdgpu/utils.h | 5 | ||||
-rw-r--r-- | libc/src/__support/GPU/nvptx/utils.h | 4 |
2 files changed, 9 insertions, 0 deletions
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h index 9b520a6bcf38..5f8ad74f6aea 100644 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -140,6 +140,11 @@ LIBC_INLINE uint32_t get_lane_size() { __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); } +/// Waits for all pending memory operations to complete in program order. +[[clang::convergent]] LIBC_INLINE void memory_fence() { + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); +} + /// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) { __builtin_amdgcn_wave_barrier(); diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h index 3f19afb83648..88b8ee2e31d3 100644 --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -118,9 +118,13 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; } uint32_t mask = static_cast<uint32_t>(lane_mask); return __nvvm_vote_ballot_sync(mask, x); } + /// Waits for all the threads in the block to converge and issues a fence. [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); } +/// Waits for all pending memory operations to complete in program order. +[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); } + /// Waits for all threads in the warp to reconverge for independent scheduling. [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) { __nvvm_bar_warp_sync(static_cast<uint32_t>(mask)); |