openmp/device/include/Synchronization.h - mirrors/github.com/llvm/llvm-project - Git at Google

 //===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 //
 //===----------------------------------------------------------------------===//

 #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
 #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H

 #include "DeviceTypes.h"
 #include "DeviceUtils.h"

 namespace ompx {
 namespace atomic {

 enum OrderingTy {
   relaxed = __ATOMIC_RELAXED,
   acquire = __ATOMIC_ACQUIRE,
   release = __ATOMIC_RELEASE,
   acq_rel = __ATOMIC_ACQ_REL,
   seq_cst = __ATOMIC_SEQ_CST,
 };

 enum MemScopeTy {
   system = __MEMORY_SCOPE_SYSTEM,
   device = __MEMORY_SCOPE_DEVICE,
   workgroup = __MEMORY_SCOPE_WRKGRP,
   wavefront = __MEMORY_SCOPE_WVFRNT,
   single = __MEMORY_SCOPE_SINGLE,
 };

 /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
 /// result is stored in \p *Addr;
 /// {

 /// Atomically increments with wrapping semantics modulo \p Val.
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
       MemScopeTy MemScope = MemScopeTy::device) {
 #if defined(__SPIRV__)
   uint32_t Old;
   while (true) {
     Old = load(Address, Ordering, MemScope);
     if (Old >= Val) {
       if (cas(Address, Old, 0u, Ordering, Ordering, MemScope))
         break;
     } else if (cas(Address, Old, Old + 1, Ordering, Ordering, MemScope))
       break;
   }
   return Old;
 #else
   return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
 #endif
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
          atomic::OrderingTy OrderingFail,
          MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
                                           OrderingSucc, OrderingFail, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
       MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V load(Ty *Address, atomic::OrderingTy Ordering,
        MemScopeTy MemScope = MemScopeTy::device) {
 #ifdef __NVPTX__
   return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope);
 #else
   return __scoped_atomic_load_n(Address, Ordering, MemScope);
 #endif
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
            MemScopeTy MemScope = MemScopeTy::device) {
   __scoped_atomic_store_n(Address, Val, Ordering, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V mul(Ty *Address, V Val, atomic::OrderingTy Ordering,
       MemScopeTy MemScope = MemScopeTy::device) {
   Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
   bool Success;
   do {
     TypedCurrentVal = atomic::load(Address, Ordering);
     TypedNewVal = TypedCurrentVal * Val;
     Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
                           atomic::relaxed, MemScope);
   } while (!Success);
   return TypedResultVal;
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<!utils::is_floating_point_v<V>, V>
 max(Ty *Address, V Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, float>, V>
 max(Ty *Address, V Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
     return utils::bitCast<float>(max(
         (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
   return utils::bitCast<float>(min(
       (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, double>, V>
 max(Ty *Address, V Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
     return utils::bitCast<double>(max(
         (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
   return utils::bitCast<double>(min(
       (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<!utils::is_floating_point_v<V>, V>
 min(Ty *Address, V Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope);
 }

 // TODO: Implement this with __atomic_fetch_max and remove the duplication.
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, float>, V>
 min(Ty *Address, V Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
     return utils::bitCast<float>(min(
         (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
   return utils::bitCast<float>(max(
       (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
 }

 // TODO: Implement this with __atomic_fetch_max and remove the duplication.
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, double>, V>
 min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering,
     MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
     return utils::bitCast<double>(min(
         (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
   return utils::bitCast<double>(max(
       (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering,
          MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering,
           MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope);
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering,
           MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope);
 }

 static inline uint32_t
 atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                MemScopeTy MemScope = MemScopeTy::device) {
   uint32_t R;
   __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope);
   return R;
 }

 ///}

 } // namespace atomic

 // FIXME: NVPTX does not respect the memory scope argument.
 namespace fence {

 /// Memory fence with \p Ordering semantics for the team.
 static inline void team(atomic::OrderingTy Ordering) {
 #ifdef __NVPTX__
   __nvvm_membar_cta();
 #else
   __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
 #endif
 }

 /// Memory fence with \p Ordering semantics for the contention group.
 static inline void kernel(atomic::OrderingTy Ordering) {
 #ifdef __NVPTX__
   __nvvm_membar_gl();
 #else
   __scoped_atomic_thread_fence(Ordering, atomic::device);
 #endif
 }

 /// Memory fence with \p Ordering semantics for the system.
 static inline void system(atomic::OrderingTy Ordering) {
 #ifdef __NVPTX__
   __nvvm_membar_sys();
 #else
   __scoped_atomic_thread_fence(Ordering, atomic::system);
 #endif
 }

 } // namespace fence

 namespace synchronize {

 /// Initialize the synchronization machinery. Must be called by all threads.
 void init(bool IsSPMD);

 /// Synchronize all threads in a warp identified by \p Mask.
 static inline void warp(LaneMaskTy Mask) { __gpu_sync_lane(Mask); }

 /// Synchronize all threads in a block and perform a fence before and after the
 /// barrier according to \p Ordering. Note that the fence might be part of the
 /// barrier.
 static inline void threads(atomic::OrderingTy Ordering) {
 #if defined(__NVPTX__)
   __nvvm_barrier_sync(8);
 #elif defined(__AMDGPU__)
   if (Ordering != atomic::relaxed)
     fence::team(Ordering == atomic::acq_rel ? atomic::release
                                             : atomic::seq_cst);

   __builtin_amdgcn_s_barrier();

   if (Ordering != atomic::relaxed)
     fence::team(Ordering == atomic::acq_rel ? atomic::acquire
                                             : atomic::seq_cst);
 #else
   __gpu_sync_threads();
 #endif
 }

 /// Synchronizing threads is allowed even if they all hit different instances of
 /// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
 /// restrictive in that it requires all threads to hit the same instance. The
 /// noinline is removed by the openmp-opt pass and helps to preserve the
 /// information till then.
 ///{

 /// Synchronize all threads in a block, they are reaching the same instruction
 /// (hence all threads in the block are "aligned"). Also perform a fence before
 /// and after the barrier according to \p Ordering. Note that the
 /// fence might be part of the barrier if the target offers this.
 [[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void
 threadsAligned(atomic::OrderingTy Ordering);

 ///}

 } // namespace synchronize

 } // namespace ompx

 #endif
	//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	//
	//===----------------------------------------------------------------------===//

	#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
	#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H

	#include "DeviceTypes.h"
	#include "DeviceUtils.h"

	namespace ompx {
	namespace atomic {

	enum OrderingTy {
	relaxed = __ATOMIC_RELAXED,
	acquire = __ATOMIC_ACQUIRE,
	release = __ATOMIC_RELEASE,
	acq_rel = __ATOMIC_ACQ_REL,
	seq_cst = __ATOMIC_SEQ_CST,
	};

	enum MemScopeTy {
	system = __MEMORY_SCOPE_SYSTEM,
	device = __MEMORY_SCOPE_DEVICE,
	workgroup = __MEMORY_SCOPE_WRKGRP,
	wavefront = __MEMORY_SCOPE_WVFRNT,
	single = __MEMORY_SCOPE_SINGLE,
	};

	/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
	/// result is stored in \p *Addr;
	/// {

	/// Atomically increments with wrapping semantics modulo \p Val.
	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	#if defined(__SPIRV__)
	uint32_t Old;
	while (true) {
	Old = load(Address, Ordering, MemScope);
	if (Old >= Val) {
	if (cas(Address, Old, 0u, Ordering, Ordering, MemScope))
	break;
	} else if (cas(Address, Old, Old + 1, Ordering, Ordering, MemScope))
	break;
	}
	return Old;
	#else
	return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
	#endif
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
	atomic::OrderingTy OrderingFail,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
	OrderingSucc, OrderingFail, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V load(Ty *Address, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	#ifdef __NVPTX__
	return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope);
	#else
	return __scoped_atomic_load_n(Address, Ordering, MemScope);
	#endif
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	__scoped_atomic_store_n(Address, Val, Ordering, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V mul(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
	bool Success;
	do {
	TypedCurrentVal = atomic::load(Address, Ordering);
	TypedNewVal = TypedCurrentVal * Val;
	Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
	atomic::relaxed, MemScope);
	} while (!Success);
	return TypedResultVal;
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<!utils::is_floating_point_v<V>, V>
	max(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<utils::is_same_v<V, float>, V>
	max(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	if (Val >= 0)
	return utils::bitCast<float>(max(
	(int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
	return utils::bitCast<float>(min(
	(uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<utils::is_same_v<V, double>, V>
	max(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	if (Val >= 0)
	return utils::bitCast<double>(max(
	(int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
	return utils::bitCast<double>(min(
	(uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<!utils::is_floating_point_v<V>, V>
	min(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope);
	}

	// TODO: Implement this with __atomic_fetch_max and remove the duplication.
	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<utils::is_same_v<V, float>, V>
	min(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	if (Val >= 0)
	return utils::bitCast<float>(min(
	(int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
	return utils::bitCast<float>(max(
	(uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
	}

	// TODO: Implement this with __atomic_fetch_max and remove the duplication.
	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	utils::enable_if_t<utils::is_same_v<V, double>, V>
	min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	if (Val >= 0)
	return utils::bitCast<double>(min(
	(int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
	return utils::bitCast<double>(max(
	(uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope);
	}

	template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
	V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope);
	}

	static inline uint32_t
	atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
	MemScopeTy MemScope = MemScopeTy::device) {
	uint32_t R;
	__scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope);
	return R;
	}

	///}

	} // namespace atomic

	// FIXME: NVPTX does not respect the memory scope argument.
	namespace fence {

	/// Memory fence with \p Ordering semantics for the team.
	static inline void team(atomic::OrderingTy Ordering) {
	#ifdef __NVPTX__
	__nvvm_membar_cta();
	#else
	__scoped_atomic_thread_fence(Ordering, atomic::workgroup);
	#endif
	}

	/// Memory fence with \p Ordering semantics for the contention group.
	static inline void kernel(atomic::OrderingTy Ordering) {
	#ifdef __NVPTX__
	__nvvm_membar_gl();
	#else
	__scoped_atomic_thread_fence(Ordering, atomic::device);
	#endif
	}

	/// Memory fence with \p Ordering semantics for the system.
	static inline void system(atomic::OrderingTy Ordering) {
	#ifdef __NVPTX__
	__nvvm_membar_sys();
	#else
	__scoped_atomic_thread_fence(Ordering, atomic::system);
	#endif
	}

	} // namespace fence

	namespace synchronize {

	/// Initialize the synchronization machinery. Must be called by all threads.
	void init(bool IsSPMD);

	/// Synchronize all threads in a warp identified by \p Mask.
	static inline void warp(LaneMaskTy Mask) { __gpu_sync_lane(Mask); }

	/// Synchronize all threads in a block and perform a fence before and after the
	/// barrier according to \p Ordering. Note that the fence might be part of the
	/// barrier.
	static inline void threads(atomic::OrderingTy Ordering) {
	#if defined(__NVPTX__)
	__nvvm_barrier_sync(8);
	#elif defined(__AMDGPU__)
	if (Ordering != atomic::relaxed)
	fence::team(Ordering == atomic::acq_rel ? atomic::release
	: atomic::seq_cst);

	__builtin_amdgcn_s_barrier();

	if (Ordering != atomic::relaxed)
	fence::team(Ordering == atomic::acq_rel ? atomic::acquire
	: atomic::seq_cst);
	#else
	__gpu_sync_threads();
	#endif
	}

	/// Synchronizing threads is allowed even if they all hit different instances of
	/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
	/// restrictive in that it requires all threads to hit the same instance. The
	/// noinline is removed by the openmp-opt pass and helps to preserve the
	/// information till then.
	///{

	/// Synchronize all threads in a block, they are reaching the same instruction
	/// (hence all threads in the block are "aligned"). Also perform a fence before
	/// and after the barrier according to \p Ordering. Note that the
	/// fence might be part of the barrier if the target offers this.
	[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void
	threadsAligned(atomic::OrderingTy Ordering);

	///}

	} // namespace synchronize

	} // namespace ompx

	#endif