HIP: Heterogeneous-computing Interface for Portability
amd_hip_atomic.h
/*
Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#if !defined(__HIPCC_RTC__)
#include "amd_device_functions.h"
#endif

#if __has_builtin(__hip_atomic_compare_exchange_strong)

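// Minimal compile-time type selector (a std::conditional stand-in that stays
// usable under hipRTC, where standard library headers may be unavailable).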
template<bool B, typename T, typename F> struct Cond_t;

template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };

#if !__HIP_DEVICE_COMPILE__
// TODO: Remove this once the compiler pre-defines the following macros.
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
#define __HIP_MEMORY_SCOPE_AGENT 4
#define __HIP_MEMORY_SCOPE_SYSTEM 5
#endif
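
// The memory-scope values above (matching the compiler's pre-defined
// equivalents) order visibility from narrowest to widest: a single thread, a
// wavefront, a workgroup (thread block), an agent (one device), and the whole
// system (all agents plus the host).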

#if !defined(__HIPCC_RTC__)
#include "amd_hip_unsafe_atomics.h"
#endif

// Atomic expanders: on gfx941, the atomic overloads below emulate certain
// operations with a compare-exchange loop; addresses in shared (LDS) memory
// are detected at run time and handled by the native atomic callback f.
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Op,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  if (is_shared_workaround((FP)p))
    return f();

  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
  U tmp1;
  do {
    tmp1 = tmp0;

    op(reinterpret_cast<T&>(tmp1), x);
  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
                                                 mem_order, mem_scope));

  return reinterpret_cast<const T&>(tmp0);
}

template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Cmp,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  if (is_shared_workaround((FP)p))
    return f();

  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
                                               mem_scope));

  return reinterpret_cast<const T&>(tmp);
}
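
// Illustrative use of the expanders (mirroring the call sites below): emulate
// an agent-scope fetch-AND with a CAS loop, deferring to the native atomic for
// LDS addresses. A sketch for exposition only; not part of the original header.
//
//   __device__ int fetch_and_agent(int* p, int v) {
//     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
//         p, v, [](int& x, int y) { x &= y; }, [=]() {
//           return __hip_atomic_fetch_and(p, v, __ATOMIC_RELAXED,
//                                         __HIP_MEMORY_SCOPE_AGENT);
//         });
//   }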

__device__
inline
int atomicCAS(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
int atomicCAS_system(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
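
// As in CUDA, atomicCAS returns the value previously stored at address; the
// swap happened iff that value equals compare. A typical retry loop
// (illustrative sketch, not part of the original header):
//
//   __device__ void fetch_or_via_cas(int* p, int bits) {
//     int old = *p, assumed;
//     do {
//       assumed = old;
//       old = atomicCAS(p, assumed, assumed | bits);
//     } while (old != assumed);
//   }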

__device__
inline
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
                             unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
                                    unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
float atomicCAS(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
float atomicCAS_system(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
double atomicCAS(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
double atomicCAS_system(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
int atomicAdd(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicAdd_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
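
// For the floating-point adds below, __AMDGCN_UNSAFE_FP_ATOMICS__ (typically
// set by the -munsafe-fp-atomics compile option) selects the hardware FP
// atomics from amd_hip_unsafe_atomics.h; these can be faster but may not take
// effect on some memory types (e.g. fine-grained host-visible memory). The
// default path uses the well-defined __hip_atomic_fetch_add builtin.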

__device__
inline
float atomicAdd(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
  __ockl_atomic_add_noret_f32(address, val);
}

__device__
inline
double atomicAdd(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
double atomicAdd_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

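// atomicSub is implemented as an atomic add of the negated operand; for the
// unsigned overloads the negation is modular (well-defined wrap-around), so
// the result matches a true atomic subtraction.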
__device__
inline
int atomicSub(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicSub_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicSub(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
float atomicSub(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
#else
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
float atomicSub_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
double atomicSub(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
#else
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
double atomicSub_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
int atomicExch(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicExch_system(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicExch(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
float atomicExch(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
float atomicExch_system(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
double atomicExch(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
double atomicExch_system(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
int atomicMin(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](int x, int y) { return x < y; }, [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMin_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](int x, int y) { return x < y; }, [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMin(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long x, unsigned long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long x, unsigned long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long long x, unsigned long long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long long x, unsigned long long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMin(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](long long x, long long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMin_system(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](long long x, long long y) { return x < y; },
    [=]() {
      return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMin(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
#endif
}

__device__
inline
float atomicMin_system(float* address, float val) {
  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  float value = __uint_as_float(tmp);

  while (val < value) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}
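
// The float/double *_system min/max variants emulate the operation with an
// atomicCAS_system retry loop seeded from the value's bit pattern. Since
// comparisons involving NaN are false, a NaN val is never stored, and a NaN
// already at address ends the loop immediately.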

__device__
inline
double atomicMin(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__
inline
double atomicMin_system(double* address, double val) {
  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  double value = __longlong_as_double(tmp);

  while (val < value) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
int atomicMax(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](int x, int y) { return y < x; }, [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMax_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](int x, int y) { return y < x; }, [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long x, unsigned long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long x, unsigned long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long long x, unsigned long long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long long x, unsigned long long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMax(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](long long x, long long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMax_system(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](long long x, long long y) { return y < x; },
    [=]() {
      return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMax(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
#endif
}

__device__
inline
float atomicMax_system(float* address, float val) {
  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  float value = __uint_as_float(tmp);

  while (value < val) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
double atomicMax(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
             __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__
inline
double atomicMax_system(double* address, double val) {
  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  double value = __longlong_as_double(tmp);

  while (value < val) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

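// atomicInc atomically computes: old = *address;
// *address = (old >= val) ? 0 : (old + 1); and returns old, i.e. a
// wrap-around increment bounded by val.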
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  __device__
  extern
  unsigned int __builtin_amdgcn_atomic_inc(
    unsigned int*,
    unsigned int,
    unsigned int,
    unsigned int,
    bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");

  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
    [=]() {
      return
        __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false);
    });
#else
  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}

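// atomicDec atomically computes: old = *address;
// *address = (old == 0 || old > val) ? val : (old - 1); and returns old, i.e.
// a wrap-around decrement bounded by val.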
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  __device__
  extern
  unsigned int __builtin_amdgcn_atomic_dec(
    unsigned int*,
    unsigned int,
    unsigned int,
    unsigned int,
    bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");

  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
    [=]() {
      return
        __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false);
    });
#else
  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}

__device__
inline
int atomicAnd(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](int& x, int y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicAnd_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](int& x, int y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x &= y; },
    [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x &= y; },
    [=]() {
      return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
int atomicOr(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](int& x, int y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicOr_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](int& x, int y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x |= y; },
    [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x |= y; },
    [=]() {
      return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                   __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
int atomicXor(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](int& x, int y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicXor_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](int& x, int y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicXor(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_SYSTEM);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x ^= y; },
    [=]() {
      return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                    __HIP_MEMORY_SCOPE_AGENT);
    });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

#else // __hip_atomic_compare_exchange_strong

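// Fallback path for compilers that do not provide the __hip_atomic_* builtins:
// the same public API is implemented with GCC-style __atomic builtins (plus
// the amdgcn inc/dec builtins), without per-call memory-scope control.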
__device__
inline
int atomicCAS(int* address, int compare, int val)
{
  __atomic_compare_exchange_n(
    address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}
__device__
inline
unsigned int atomicCAS(
  unsigned int* address, unsigned int compare, unsigned int val)
{
  __atomic_compare_exchange_n(
    address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}
__device__
inline
unsigned long long atomicCAS(
  unsigned long long* address,
  unsigned long long compare,
  unsigned long long val)
{
  __atomic_compare_exchange_n(
    address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}

__device__
inline
int atomicAdd(int* address, int val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAdd(
  unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicAdd(float* address, float val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}

#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
  __ockl_atomic_add_noret_f32(address, val);
}

__device__
inline
double atomicAdd(double* address, double val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}

__device__
inline
int atomicSub(int* address, int val)
{
  return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicExch(int* address, int val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicExch(float* address, float val)
{
  return __uint_as_float(__atomic_exchange_n(
    reinterpret_cast<unsigned int*>(address),
    __float_as_uint(val),
    __ATOMIC_RELAXED));
}

__device__
inline
int atomicMin(int* address, int val)
{
  return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMin(
  unsigned long long* address, unsigned long long val)
{
  unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (val < tmp) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) { tmp = tmp1; continue; }

    tmp = atomicCAS(address, tmp, val);
  }

  return tmp;
}
__device__ inline long long atomicMin(long long* address, long long val) {
  long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (val < tmp) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) {
      tmp = tmp1;
      continue;
    }

    tmp = atomicCAS(address, tmp, val);
  }
  return tmp;
}
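
// The 64-bit min/max fallbacks have no single fetch_min/fetch_max builtin
// here, so they use an optimistic load plus an atomicCAS retry loop: reload
// if the value changed underneath, otherwise attempt the swap.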

__device__
inline
int atomicMax(int* address, int val)
{
  return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMax(
  unsigned long long* address, unsigned long long val)
{
  unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (tmp < val) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) { tmp = tmp1; continue; }

    tmp = atomicCAS(address, tmp, val);
  }

  return tmp;
}
__device__ inline long long atomicMax(long long* address, long long val) {
  long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (tmp < val) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) {
      tmp = tmp1;
      continue;
    }

    tmp = atomicCAS(address, tmp, val);
  }
  return tmp;
}

__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
}

__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
}

__device__
inline
int atomicAnd(int* address, int val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAnd(
  unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicOr(int* address, int val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicOr(
  unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicXor(int* address, int val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicXor(
  unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}

#endif // __hip_atomic_compare_exchange_strong