HIP: Heterogeneous-computing Interface for Portability
amd_hip_unsafe_atomics.h
/*
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#ifdef __cplusplus

__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
    __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  if (__builtin_amdgcn_is_shared(
          (const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  else if (__builtin_amdgcn_is_private(
               (const __attribute__((address_space(0))) void*)addr)) {
    float temp = *addr;
    *addr = temp + value;
    return temp;
  }
  else
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
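// Usage sketch (illustrative only; kernel and variable names are hypothetical):
// accumulate per-thread contributions into a single float total. On gfx90a this
// takes the hardware fadd fast path selected above; elsewhere it falls back to
// the relaxed agent-scope atomic add.
//
//   __global__ void accumulate(float* total, const float* in, size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) unsafeAtomicAdd(total, in[i]);
//   }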

__device__ inline float unsafeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline float unsafeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}
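// Usage sketch (illustrative; names are hypothetical): track the running minimum
// and maximum of a stream of values with the float overloads above. Both loop on
// a relaxed compare-exchange until the stored value is at least as extreme as val.
//
//   __global__ void track_range(float* lo, float* hi, const float* in, size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) {
//       unsafeAtomicMin(lo, in[i]);
//       unsafeAtomicMax(hi, in[i]);
//     }
//   }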

__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}
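// Usage sketch (illustrative; names are hypothetical): the double overloads are
// used the same way as the float ones. On gfx90a/gfx940/gfx941/gfx942 they map
// to the flat fmin/fmax instructions, otherwise to the CAS loops above.
//
//   __global__ void bounds(double* lo, double* hi, const double* in, size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) {
//       unsafeAtomicMin(lo, in[i]);
//       unsafeAtomicMax(hi, in[i]);
//     }
//   }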

__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, the compiler can generate an unsafe FP32 atomic add that does not
  // follow all IEEE rules when -munsafe-fp-atomics is passed, so do a CAS loop
  // emulation instead.
  // On gfx941, the unsafe FP32 atomic add may not always happen atomically, so
  // force a CAS loop emulation to ensure safety.
  // On gfx90a, gfx940 and gfx942, if the __hip_atomic_fetch_add builtin is not
  // available, a CAS loop also needs to be forced here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
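// Usage sketch (illustrative; names are hypothetical): prefer safeAtomicAdd when
// the translation unit is built with -munsafe-fp-atomics but a particular
// accumulation still needs a true atomic read-modify-write; the CAS loop above is
// forced on the architectures where the hardware fast path is not safe.
//
//   __global__ void accumulate_exact(float* total, const float* in, size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) safeAtomicAdd(total, in[i]);
//   }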

__device__ inline float safeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline float safeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}

__device__ inline double safeAtomicMax(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

__device__ inline double safeAtomicMin(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}
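// Usage sketch (illustrative; names are hypothetical): the double safeAtomicMax /
// safeAtomicMin overloads first check for a private (scratch) pointer, since a
// thread-local address needs no atomic update, and otherwise use the agent-scope
// CAS loop.
//
//   __global__ void bounds_safe(double* lo, double* hi, const double* in, size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) {
//       safeAtomicMin(lo, in[i]);
//       safeAtomicMax(hi, in[i]);
//     }
//   }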

#endif