#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"

__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
    __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  // Dispatch on the pointer's address space: LDS, private (scratch), or
  // global, each of which needs a different atomic-add lowering.
  if (__builtin_amdgcn_is_shared(
          (const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  else if (__builtin_amdgcn_is_private(
               (const __attribute__((address_space(0))) void*)addr)) {
    // Private memory is only visible to the owning thread, so a plain
    // read-modify-write is equivalent to an atomic one.
    float temp = *addr;
    *addr = temp + value;
    return temp;
  } else {
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
  }
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

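// Usage sketch (illustrative only; the kernel below is hypothetical and not
// part of this header): accumulating per-thread contributions into a single
// float. The "unsafe" add may take the hardware fast path, so the result can
// differ from IEEE-754 round-to-nearest sequential accumulation.
//
//   __global__ void accumulate(float* sum, const float* data, size_t n) {
//     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) unsafeAtomicAdd(sum, data[i]);
//   }
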
__device__ inline float unsafeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

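// Note on the compare-and-swap loop above (and the Min variant below): the
// function returns the value observed at *addr before any update. Because a
// store is attempted only while the observed value compares less than val, a
// NaN val is never written, and the loop terminates early as soon as another
// thread has installed a value that already satisfies the comparison.
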
__device__ inline float unsafeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

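// Caveat: the gfx90a fast path above lowers to the hardware flat FP64 atomic
// add. On MI200-class devices such hardware FP atomics are documented as not
// operating on fine-grained memory allocations and as not following all
// IEEE-754 rounding rules, which is what makes this entry point "unsafe";
// prefer safeAtomicAdd below when those guarantees are required.
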
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // These targets can emit a hardware FP32 atomic add that does not follow
  // all IEEE-754 rules when -munsafe-fp-atomics is passed, so use a CAS loop
  // to keep the addition safe.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  old_val = __uint_as_float(
      __atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif
    // On failure `expected` holds the value another thread installed; retry
    // with it. On success it still equals `temp` and the loop exits.
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, relaxed system-scope atomics compile to safe CAS loops but are
  // otherwise no different from agent-scope atomics.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

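// Illustrative call-site choice (hypothetical kernel, not part of this
// header): prefer safeAtomicAdd where IEEE-754 results matter even under
// -munsafe-fp-atomics, and unsafeAtomicAdd where throughput outweighs strict
// rounding guarantees.
//
//   __global__ void binWeights(float* bins, const int* bin, const float* w,
//                              size_t n) {
//     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) safeAtomicAdd(&bins[bin[i]], w[i]);
//   }
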
__device__ inline float safeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

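// Note: for float, safeAtomicMax currently shares the same CAS-loop
// implementation as unsafeAtomicMax above; the loop is already IEEE-safe, so
// there is no unsafe fast path to avoid (safeAtomicMin below mirrors
// unsafeAtomicMin the same way).
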
__device__ inline float safeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, relaxed system-scope atomics compile to safe CAS loops but are
  // otherwise no different from agent-scope atomics.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // Without the __hip_atomic builtins, keep the addition safe with an
  // explicit CAS loop.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  old_val = __longlong_as_double(
      __atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
#endif
}

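// Note on scopes: __HIP_MEMORY_SCOPE_AGENT makes the operation atomic with
// respect to the whole device, while __HIP_MEMORY_SCOPE_SYSTEM extends that
// to the host and peer devices. On gfx90a the system scope is also what
// forces the safe CAS expansion instead of the hardware FP64 add, which is
// why the first branch above uses it.
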
__device__ inline double safeAtomicMax(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

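// Rationale for the private-pointer branch above (and in safeAtomicMin
// below): scratch (private) memory is visible only to the owning thread, so
// hardware atomics do not apply there and a plain read-modify-write is
// equivalent. The unusual #if/#endif placement keeps the `{`/`}` pair of the
// else-branch balanced whether or not the builtin is available.
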
__device__ inline double safeAtomicMin(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

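// End-to-end sketch (illustrative only; names are hypothetical): tracking a
// running minimum and maximum across threads with the safe variants, which
// keep IEEE semantics regardless of the -munsafe-fp-atomics compile flag.
//
//   __global__ void extrema(float* mn, float* mx, const float* x, size_t n) {
//     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) { safeAtomicMin(mn, x[i]); safeAtomicMax(mx, x[i]); }
//   }
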
#if defined(__clang__)
#pragma clang diagnostic pop
#endif