#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_device_functions.h>
#endif

#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
#endif
#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif

#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif

#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif
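// Implementation details of the HIP cooperative groups API: device-side helper
// traits and functions only, not part of the public interface.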
namespace cooperative_groups {

namespace internal {
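// Compile-time traits used to validate tile sizes and element types.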
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
template <unsigned int size>
using is_valid_tile_size =
    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

template <typename T>
using is_valid_type =
    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
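// Helpers for multi-grid cooperative groups (kernels launched cooperatively across
// multiple devices); these forward to the OCKL multi-grid intrinsics.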
namespace multi_grid {
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }

}  // namespace multi_grid
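// Helpers for grid-wide cooperative groups: every thread of a single cooperative
// kernel launch.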
namespace grid {

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                               (blockDim.x * gridDim.x));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute the global id of the workgroup the current thread belongs to.
  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                          (blockIdx.y * gridDim.x) + (blockIdx.x));

  // Number of threads in all workgroups that precede the current one.
  uint32_t num_threads_till_current_workgroup =
      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));

  // Local rank of the thread within the current workgroup.
  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                                     (threadIdx.y * blockDim.x) + (threadIdx.x));

  return (num_threads_till_current_workgroup + local_thread_rank);
}
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }

}  // namespace grid
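// Helpers for workgroup-level groups (the thread block of the current kernel).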
namespace workgroup {

__CG_STATIC_QUALIFIER__ dim3 group_index() {
  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
               static_cast<uint32_t>(blockIdx.z)));
}
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
               static_cast<uint32_t>(threadIdx.z)));
}
__CG_STATIC_QUALIFIER__ uint32_t size() {
  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                (threadIdx.y * blockDim.x) + (threadIdx.x)));
}
__CG_STATIC_QUALIFIER__ bool is_valid() { return (true); }
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
               static_cast<uint32_t>(blockDim.z)));
}

}  // namespace workgroup
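// Helpers for tiled partitions. A valid tile never spans more than one wavefront
// (see is_valid_tile_size above), so sync() only needs a memory fence, not a barrier.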
namespace tiled_group {
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

}  // namespace tiled_group
// Helpers for coalesced groups: the lanes of the current wavefront that are active
// when the group is created.
namespace coalesced_group {
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
// Returns the number of set bits in `x` that correspond to lanes with an id lower
// than the calling lane, starting the count at `add` (wraps the mbcnt builtins).
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
  unsigned int counter = 0;
#if __AMDGCN_WAVEFRONT_SIZE == 32
  counter = __builtin_amdgcn_mbcnt_lo(x, add);
#else
  counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
  counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
#endif

  return counter;
}

}  // namespace coalesced_group

}  // namespace internal

}  // namespace cooperative_groups
#if defined(__clang__)
#pragma clang diagnostic pop
#endif

#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H