#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H

#include <hip/amd_detail/amd_hip_common.h>
// Build-identification hooks consumed by the AMD debugger API (amd-dbgapi).
const char* amd_dbgapi_get_build_name();
const char* amd_dbgapi_get_git_hash();
size_t amd_dbgapi_get_build_id();
#if !defined(__HIPCC_RTC__)
#include <stdint.h>
#else
// HIPRTC compiles without the standard headers, so define the fixed-width
// integer types directly.
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
#endif  // !defined(__HIPCC_RTC__)
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
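// Illustrative use of __align__ (the struct name is hypothetical): request a
// 16-byte-aligned type, e.g. for vectorized loads.
//
//   struct __align__(16) Float4 {
//       float x, y, z, w;
//   };
//   static_assert(alignof(Float4) == 16, "alignment applied");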
#define CUDA_SUCCESS hipSuccess
#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
extern int HIP_TRACE_API;
#endif  // !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_ldg.h>
#include <hip/amd_detail/amd_hip_atomic.h>
#include <hip/amd_detail/amd_device_functions.h>
#include <hip/amd_detail/amd_surface_functions.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/texture_indirect_functions.h>
// Older HCC toolchains define only __KALMAR_ACCELERATOR__; map it forward.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif
// Feature tests (device compilation):
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__

// 32-bit atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)

// Doubles:
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// Warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// Synchronization:
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// Miscellaneous:
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif  // Device feature tests
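// Illustrative use of a feature-test macro in device code (the kernel name is
// hypothetical): compile the atomicAdd path only where FP32 atomic add exists.
//
//   __global__ void accumulate(float* sum, const float* v) {
//   #if __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__
//       atomicAdd(sum, v[threadIdx.x]);
//   #else
//       // fall back to a reduction-based scheme
//   #endif
//   }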
#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...)                                                                     \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
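// How the dispatch works: select_impl_ always yields its third argument, so a
// one-argument __launch_bounds__ selects launch_bounds_impl0 and a
// two-argument call selects launch_bounds_impl1. Illustrative expansions
// (kernel names hypothetical):
//
//   __global__ void __launch_bounds__(256) k1();
//     // -> __attribute__((amdgpu_flat_work_group_size(1, 256)))
//   __global__ void __launch_bounds__(256, 4) k2();
//     // -> __attribute__((amdgpu_flat_work_group_size(1, 256),
//     //                   amdgpu_waves_per_eu(4)))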
#if !defined(__HIPCC_RTC__)
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#endif  // !defined(__HIPCC_RTC__)
#if !defined(__HIPCC_RTC__)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

// Legacy launch-parameter placeholder kept for source compatibility with the
// old hipLaunchKernel interface.
typedef int hipLaunchParm;
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function.");
#endif
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
                                             std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    std::tuple<Formals...> to_formals{std::move(actuals)};
    return to_formals;
}
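// Taken together: validateArgsCountType converts the caller's argument pack
// to the kernel's formal parameter types, and pArgs<0> walks that tuple to
// fill the void* array hipLaunchKernel expects. A minimal sketch (the kernel
// and values are hypothetical):
//
//   void axpy(float a, float* x);  // __global__ entry point's host signature
//   auto tup = validateArgsCountType(&axpy, std::tuple<double, float*>{2.0, p});
//   void* args[2];
//   pArgs<0>(tup, args);  // args[0] -> the float a, args[1] -> the float* x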
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
    do {                                                                                           \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);         \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif  // defined(HIP_TEMPLATE_KERNEL_LAUNCH)
#endif  // !defined(__HIPCC_RTC__)
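// Both the template and macro forms accept the same call shape. Illustrative
// launch (the kernel and its arguments are hypothetical):
//
//   hipLaunchKernelGGL(vectorAdd,                 // kernel
//                      dim3(blocks), dim3(256),   // grid and block dimensions
//                      0, 0,                      // dynamic LDS bytes, stream
//                      d_a, d_b, d_c, n);         // kernel arguments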
#include <hip/hip_runtime_api.h>
240extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
241extern "C" __device__ __attribute__((const))
size_t __ockl_get_group_id(uint);
242extern "C" __device__ __attribute__((const))
size_t __ockl_get_local_size(uint);
243extern "C" __device__ __attribute__((const))
size_t __ockl_get_num_groups(uint);
struct __HIP_BlockIdx {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
};
struct __HIP_BlockDim {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_local_size(x);
    }
};
struct __HIP_GridDim {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_num_groups(x);
    }
};
struct __HIP_ThreadIdx {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_local_id(x);
    }
};
#if defined(__HIPCC_RTC__)
struct dim3 {
    uint32_t x;
    uint32_t y;
    uint32_t z;
    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1)
        : x(_x), y(_y), z(_z) {}
};
#endif  // defined(__HIPCC_RTC__)
template <typename F>
struct __HIP_Coordinates {
    using R = decltype(F{}(0));

    struct __X {
        __device__ operator R() const noexcept { return F{}(0); }
        __device__ R operator+=(const R& rhs) { return F{}(0) + rhs; }
    };
    struct __Y {
        __device__ operator R() const noexcept { return F{}(1); }
        __device__ R operator+=(const R& rhs) { return F{}(1) + rhs; }
    };
    struct __Z {
        __device__ operator R() const noexcept { return F{}(2); }
        __device__ R operator+=(const R& rhs) { return F{}(2) + rhs; }
    };

    static constexpr __X x{};
    static constexpr __Y y{};
    static constexpr __Z z{};

    __device__ operator dim3() const { return dim3(x, y, z); }
};
#if !defined(_MSC_VER)
template <typename F>
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
#endif
#if !defined(_MSC_VER)
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
#endif
#if !defined(_MSC_VER)
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
#endif
319extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_size(uint);
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
                        __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
    return __ockl_get_global_size(0);
}
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
                        __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
    return __ockl_get_global_size(0);
}
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
                        __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
    return __ockl_get_global_size(1);
}
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
                        __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
    return __ockl_get_global_size(1);
}
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
                        __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
    return __ockl_get_global_size(2);
}
inline __device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
                        __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
    return __ockl_get_global_size(2);
}
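// These overloads let the common global-size idiom compile to a single
// __ockl_get_global_size call instead of two queries and a multiply, e.g. in
// device code:
//
//   std::uint32_t nx = gridDim.x * blockDim.x;  // == __ockl_get_global_size(0)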
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
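// The objects above give HIP kernels the familiar CUDA-style builtins.
// Illustrative device code (the kernel is hypothetical):
//
//   __global__ void iota(int* out) {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       out[i] = i;
//   }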
362extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
363#define hipThreadIdx_x (__ockl_get_local_id(0))
364#define hipThreadIdx_y (__ockl_get_local_id(1))
365#define hipThreadIdx_z (__ockl_get_local_id(2))
367extern "C" __device__ __attribute__((
const)) size_t __ockl_get_group_id(uint);
368#define hipBlockIdx_x (__ockl_get_group_id(0))
369#define hipBlockIdx_y (__ockl_get_group_id(1))
370#define hipBlockIdx_z (__ockl_get_group_id(2))
372extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_size(uint);
373#define hipBlockDim_x (__ockl_get_local_size(0))
374#define hipBlockDim_y (__ockl_get_local_size(1))
375#define hipBlockDim_z (__ockl_get_local_size(2))
377extern "C" __device__ __attribute__((
const)) size_t __ockl_get_num_groups(uint);
378#define hipGridDim_x (__ockl_get_num_groups(0))
379#define hipGridDim_y (__ockl_get_num_groups(1))
380#define hipGridDim_z (__ockl_get_num_groups(2))
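// These hip*_{x,y,z} macros are the older spelled-out forms of the builtins:
// hipThreadIdx_x is equivalent to threadIdx.x, hipBlockDim_y to blockDim.y,
// and so on, e.g.:
//
//   int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;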
#include <hip/amd_detail/amd_math_functions.h>
#if __HIP_HCC_COMPAT_MODE__
// Define the HCC work-item query functions in terms of the HIP builtins above.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun, hip_var)                                          \
    inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
        if (i == 0)                                                                 \
            return hip_var.x;                                                       \
        else if (i == 1)                                                            \
            return hip_var.y;                                                       \
        else                                                                        \
            return hip_var.z;                                                       \
    }

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")
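// Under __HIP_HCC_COMPAT_MODE__ the expansions above provide the HCC-era
// entry points, e.g. hc_get_workitem_id(0) returns threadIdx.x and
// hc_get_group_size(1) returns blockDim.y.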
403extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_id(uint);
404inline __device__ __attribute__((always_inline)) uint
405hc_get_workitem_absolute_id(
int dim)
407 return (uint)__ockl_get_global_id(dim);
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if !defined(__HIPCC_RTC__)
// Pull in the clang CUDA wrapper headers (for std::complex and the libc++/
// libstdc++ interop) when not compiling under OpenMP, unless explicitly
// enabled there.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif  // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif  // !defined(__HIPCC_RTC__)
#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__

#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H