[libc] Add memory fence utility to the GPU utilities (#91756)

Summary: GPUs like to execute instructions in the background until something explicitly consumes them. We are working on adding some microbenchmarking code, which requires flushing the pending memory operations beforehand. This patch simply adds these utility functions that will be used in the near future.
author: Joseph Huber <huberjn@outlook.com> 2024-05-10 16:38:13 -0500
committer: GitHub <noreply@github.com> 2024-05-10 16:38:13 -0500
commit: fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c (patch)
tree: a7f1cdac64b12f731d6a6f5d54815a99e3f35ecd
parent: 5d18d575d8d304e4336483d8be7394d0f4694cb1 (diff)
download: llvm-fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c.tar.gz
2 files changed, 9 insertions, 0 deletions
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 9b520a6bcf38..5f8ad74f6aea 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -140,6 +140,11 @@ LIBC_INLINE uint32_t get_lane_size() {
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() {
+  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
+}
+
 /// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
   __builtin_amdgcn_wave_barrier();
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 3f19afb83648..88b8ee2e31d3 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -118,9 +118,13 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
   uint32_t mask = static_cast<uint32_t>(lane_mask);
   return __nvvm_vote_ballot_sync(mask, x);
 }
+
 /// Waits for all the threads in the block to converge and issues a fence.
 [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
+
 /// Waits for all threads in the warp to reconverge for independent scheduling.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
author	Joseph Huber <huberjn@outlook.com>	2024-05-10 16:38:13 -0500
committer	GitHub <noreply@github.com>	2024-05-10 16:38:13 -0500
commit	fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c (patch)
tree	a7f1cdac64b12f731d6a6f5d54815a99e3f35ecd
parent	5d18d575d8d304e4336483d8be7394d0f4694cb1 (diff)
download	llvm-fb3f4b013c3acab0ea3cb14c4d29f4e6d9caa33c.tar.gz