HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_warp_functions.h
1/*
2Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
26#if defined(__clang__)
27#pragma clang diagnostic push
28#pragma clang diagnostic ignored "-Wreserved-identifier"
29#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
30#pragma clang diagnostic ignored "-Wsign-conversion"
31#pragma clang diagnostic ignored "-Wold-style-cast"
32#pragma clang diagnostic ignored "-Wc++98-compat"
33#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
34#endif
35
36__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
37 union { int i; unsigned u; float f; } tmp; tmp.u = src;
38 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
39 return tmp.u;
40}
41
42__device__ static inline float __hip_ds_bpermutef(int index, float src) {
43 union { int i; unsigned u; float f; } tmp; tmp.f = src;
44 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
45 return tmp.f;
46}
47
48__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
49 union { int i; unsigned u; float f; } tmp; tmp.u = src;
50 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
51 return tmp.u;
52}
53
54__device__ static inline float __hip_ds_permutef(int index, float src) {
55 union { int i; unsigned u; float f; } tmp; tmp.f = src;
56 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
57 return tmp.f;
58}
59
60#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
61#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
62
63template <int pattern>
64__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
65 union { int i; unsigned u; float f; } tmp; tmp.u = src;
66 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
67 return tmp.u;
68}
69
70template <int pattern>
71__device__ static inline float __hip_ds_swizzlef_N(float src) {
72 union { int i; unsigned u; float f; } tmp; tmp.f = src;
73 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
74 return tmp.f;
75}
76
77#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
78 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
79
80template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
81__device__ static inline int __hip_move_dpp_N(int src) {
82 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
83 bound_ctrl);
84}
85
86static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
87
88__device__
89inline
90int __shfl(int var, int src_lane, int width = warpSize) {
91 int self = __lane_id();
92 int index = (src_lane & (width - 1)) + (self & ~(width-1));
93 return __builtin_amdgcn_ds_bpermute(index<<2, var);
94}
95__device__
96inline
97unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
98 union { int i; unsigned u; float f; } tmp; tmp.u = var;
99 tmp.i = __shfl(tmp.i, src_lane, width);
100 return tmp.u;
101}
102__device__
103inline
104float __shfl(float var, int src_lane, int width = warpSize) {
105 union { int i; unsigned u; float f; } tmp; tmp.f = var;
106 tmp.i = __shfl(tmp.i, src_lane, width);
107 return tmp.f;
108}
109__device__
110inline
111double __shfl(double var, int src_lane, int width = warpSize) {
112 static_assert(sizeof(double) == 2 * sizeof(int), "");
113 static_assert(sizeof(double) == sizeof(uint64_t), "");
114
115 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
116 tmp[0] = __shfl(tmp[0], src_lane, width);
117 tmp[1] = __shfl(tmp[1], src_lane, width);
118
119 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
120 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
121 return tmp1;
122}
123__device__
124inline
125long __shfl(long var, int src_lane, int width = warpSize)
126{
127 #ifndef _MSC_VER
128 static_assert(sizeof(long) == 2 * sizeof(int), "");
129 static_assert(sizeof(long) == sizeof(uint64_t), "");
130
131 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
132 tmp[0] = __shfl(tmp[0], src_lane, width);
133 tmp[1] = __shfl(tmp[1], src_lane, width);
134
135 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
136 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
137 return tmp1;
138 #else
139 static_assert(sizeof(long) == sizeof(int), "");
140 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
141 #endif
142}
143__device__
144inline
145unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
146 #ifndef _MSC_VER
147 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
148 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
149
150 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
151 tmp[0] = __shfl(tmp[0], src_lane, width);
152 tmp[1] = __shfl(tmp[1], src_lane, width);
153
154 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
155 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
156 return tmp1;
157 #else
158 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
159 return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
160 #endif
161}
162__device__
163inline
164long long __shfl(long long var, int src_lane, int width = warpSize)
165{
166 static_assert(sizeof(long long) == 2 * sizeof(int), "");
167 static_assert(sizeof(long long) == sizeof(uint64_t), "");
168
169 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
170 tmp[0] = __shfl(tmp[0], src_lane, width);
171 tmp[1] = __shfl(tmp[1], src_lane, width);
172
173 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
174 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
175 return tmp1;
176}
177__device__
178inline
179unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
180 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
181 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
182
183 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
184 tmp[0] = __shfl(tmp[0], src_lane, width);
185 tmp[1] = __shfl(tmp[1], src_lane, width);
186
187 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
188 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
189 return tmp1;
190}
191
192__device__
193inline
194int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
195 int self = __lane_id();
196 int index = self - lane_delta;
197 index = (index < (self & ~(width-1)))?self:index;
198 return __builtin_amdgcn_ds_bpermute(index<<2, var);
199}
200__device__
201inline
202unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
203 union { int i; unsigned u; float f; } tmp; tmp.u = var;
204 tmp.i = __shfl_up(tmp.i, lane_delta, width);
205 return tmp.u;
206}
207__device__
208inline
209float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
210 union { int i; unsigned u; float f; } tmp; tmp.f = var;
211 tmp.i = __shfl_up(tmp.i, lane_delta, width);
212 return tmp.f;
213}
214__device__
215inline
216double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
217 static_assert(sizeof(double) == 2 * sizeof(int), "");
218 static_assert(sizeof(double) == sizeof(uint64_t), "");
219
220 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
221 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
222 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
223
224 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
225 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
226 return tmp1;
227}
228__device__
229inline
230long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
231{
232 #ifndef _MSC_VER
233 static_assert(sizeof(long) == 2 * sizeof(int), "");
234 static_assert(sizeof(long) == sizeof(uint64_t), "");
235
236 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
237 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
238 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
239
240 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
241 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
242 return tmp1;
243 #else
244 static_assert(sizeof(long) == sizeof(int), "");
245 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
246 #endif
247}
248
249__device__
250inline
251unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
252{
253 #ifndef _MSC_VER
254 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
255 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
256
257 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
258 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
259 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
260
261 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
262 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
263 return tmp1;
264 #else
265 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
266 return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
267 #endif
268}
269
270__device__
271inline
272long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
273{
274 static_assert(sizeof(long long) == 2 * sizeof(int), "");
275 static_assert(sizeof(long long) == sizeof(uint64_t), "");
276 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
277 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
278 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
279 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
280 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
281 return tmp1;
282}
283
284__device__
285inline
286unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
287{
288 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
289 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
290 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
291 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
292 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
293 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
294 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
295 return tmp1;
296}
297
298__device__
299inline
300int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
301 int self = __lane_id();
302 int index = self + lane_delta;
303 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
304 return __builtin_amdgcn_ds_bpermute(index<<2, var);
305}
306__device__
307inline
308unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
309 union { int i; unsigned u; float f; } tmp; tmp.u = var;
310 tmp.i = __shfl_down(tmp.i, lane_delta, width);
311 return tmp.u;
312}
313__device__
314inline
315float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
316 union { int i; unsigned u; float f; } tmp; tmp.f = var;
317 tmp.i = __shfl_down(tmp.i, lane_delta, width);
318 return tmp.f;
319}
320__device__
321inline
322double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
323 static_assert(sizeof(double) == 2 * sizeof(int), "");
324 static_assert(sizeof(double) == sizeof(uint64_t), "");
325
326 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
327 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
328 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
329
330 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
331 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
332 return tmp1;
333}
334__device__
335inline
336long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
337{
338 #ifndef _MSC_VER
339 static_assert(sizeof(long) == 2 * sizeof(int), "");
340 static_assert(sizeof(long) == sizeof(uint64_t), "");
341
342 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
343 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
344 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
345
346 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
347 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
348 return tmp1;
349 #else
350 static_assert(sizeof(long) == sizeof(int), "");
351 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
352 #endif
353}
354__device__
355inline
356unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
357{
358 #ifndef _MSC_VER
359 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
360 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
361
362 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
363 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
364 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
365
366 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
367 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
368 return tmp1;
369 #else
370 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
371 return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
372 #endif
373}
374__device__
375inline
376long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
377{
378 static_assert(sizeof(long long) == 2 * sizeof(int), "");
379 static_assert(sizeof(long long) == sizeof(uint64_t), "");
380 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
381 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
382 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
383 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
384 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
385 return tmp1;
386}
387__device__
388inline
389unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
390{
391 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
392 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
393 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
394 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
395 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
396 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
397 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
398 return tmp1;
399}
400
401__device__
402inline
403int __shfl_xor(int var, int lane_mask, int width = warpSize) {
404 int self = __lane_id();
405 int index = self^lane_mask;
406 index = index >= ((self+width)&~(width-1))?self:index;
407 return __builtin_amdgcn_ds_bpermute(index<<2, var);
408}
409__device__
410inline
411unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
412 union { int i; unsigned u; float f; } tmp; tmp.u = var;
413 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
414 return tmp.u;
415}
416__device__
417inline
418float __shfl_xor(float var, int lane_mask, int width = warpSize) {
419 union { int i; unsigned u; float f; } tmp; tmp.f = var;
420 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
421 return tmp.f;
422}
423__device__
424inline
425double __shfl_xor(double var, int lane_mask, int width = warpSize) {
426 static_assert(sizeof(double) == 2 * sizeof(int), "");
427 static_assert(sizeof(double) == sizeof(uint64_t), "");
428
429 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
430 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
431 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
432
433 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
434 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
435 return tmp1;
436}
437__device__
438inline
439long __shfl_xor(long var, int lane_mask, int width = warpSize)
440{
441 #ifndef _MSC_VER
442 static_assert(sizeof(long) == 2 * sizeof(int), "");
443 static_assert(sizeof(long) == sizeof(uint64_t), "");
444
445 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
446 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
447 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
448
449 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
450 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
451 return tmp1;
452 #else
453 static_assert(sizeof(long) == sizeof(int), "");
454 return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
455 #endif
456}
457__device__
458inline
459unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
460{
461 #ifndef _MSC_VER
462 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
463 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
464
465 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
466 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
467 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
468
469 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
470 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
471 return tmp1;
472 #else
473 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
474 return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
475 #endif
476}
477__device__
478inline
479long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
480{
481 static_assert(sizeof(long long) == 2 * sizeof(int), "");
482 static_assert(sizeof(long long) == sizeof(uint64_t), "");
483 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
484 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
485 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
486 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
487 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
488 return tmp1;
489}
490__device__
491inline
492unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
493{
494 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
495 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
496 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
497 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
498 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
499 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
500 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
501 return tmp1;
502}
503
504#if defined(__clang__)
505#pragma clang diagnostic pop
506#endif
507
508#endif