// amd_hip_runtime.h
// HIP: Heterogeneous-computing Interface for Portability
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

//#pragma once
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H

#include <hip/amd_detail/amd_hip_common.h>

#if !defined(__HIPCC_RTC__)
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif // __cplusplus
#endif // !defined(__HIPCC_RTC__)

#ifdef __cplusplus
extern "C" {
#endif

// Debugger-support (amd_dbgapi) entry points exposed with C linkage.
// NOTE(review): semantics below are inferred from the names; the definitions
// are not visible in this header — confirm against the runtime sources.

// Returns the build name string of the runtime, for debug tooling.
const char* amd_dbgapi_get_build_name();

// Returns the git hash string of the runtime build, for debug tooling.
const char* amd_dbgapi_get_git_hash();

// Returns the numeric build id of the runtime, for debug tooling.
size_t amd_dbgapi_get_build_id();

#ifdef __cplusplus
} /* extern "C" */
#endif
77
//---
// The top part of this file can be compiled with any compiler.

#if !defined(__HIPCC_RTC__)
#ifdef __cplusplus
#include <cmath>
#include <cstdint>
#include <tuple>
#else
#include <math.h>
#include <stdint.h>
#endif // __cplusplus
#else
// hipRTC compiles have no standard headers available, so provide the minimal
// fixed-width integer typedefs (plus their std:: aliases) that this header
// relies on — unless the user opts out with __HIP_NO_STD_DEFS__.
#if !__HIP_NO_STD_DEFS__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
#endif // __HIP_NO_STD_DEFS__
#endif // !defined(__HIPCC_RTC__)
104
#if __HIP_CLANG_ONLY__

#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#define CUDA_SUCCESS hipSuccess

#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
#include <hip/amd_detail/amd_hip_atomic.h>
#include <hip/amd_detail/amd_device_functions.h>
#include <hip/amd_detail/amd_surface_functions.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/texture_indirect_functions.h>
extern int HIP_TRACE_API;
#endif // !defined(__HIPCC_RTC__)

#ifdef __cplusplus
#include <hip/amd_detail/hip_ldg.h>
#endif

// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif

// Feature tests: architecture-capability macros. These are only defined when
// compiling for the device; host passes leave them undefined.
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// Warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// Synchronization:
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// Misc:
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif /* Device feature flags */
169
// __launch_bounds__(maxThreadsPerBlock [, minWavesPerEU]) mapped onto AMDGPU
// attributes: the one-argument form constrains the flat work-group size; the
// two-argument form additionally requests a minimum waves-per-EU occupancy.
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
// Argument-count dispatch: selects impl1 for two arguments, impl0 for one.
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
178
#if !defined(__HIPCC_RTC__)
// Host-side stub: there is no dynamic group segment (LDS) on the host, so the
// base pointer is always null.
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#endif // !defined(__HIPCC_RTC__)
182
// End doxygen API:
//
// hip-clang functions
//
#if !defined(__HIPCC_RTC__)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

// Legacy launch-parameter placeholder type, kept for source compatibility.
typedef int hipLaunchParm;

// pArgs<n>: recursion terminator — every tuple element has been packed.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

// pArgs<n>: stores the address of the n-th kernel argument into _vargs[n] and
// recurses on n + 1. Builds the void** argument array consumed by
// hipLaunchKernel below.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    // Opt-in stricter checking: kernel arguments are copied to the device by
    // value, so they must be trivially copyable.
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    // Strip const so the address fits the void** launch ABI; the pointee is a
    // live element of `formals`, which must outlive the use of _vargs.
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
217
// Compile-time check that the actual launch arguments match the kernel's
// formal parameter list in count, followed by conversion of the actuals to
// the formal parameter types. The kernel pointer is used only for deduction.
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    return std::tuple<Formals...>{std::move(actuals)};
}
224
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
// Template form of the launcher: validates/converts the arguments, packs
// their addresses into a void* array, and dispatches via hipLaunchKernel.
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    // Converts actuals to the kernel's formal types (compile error on count mismatch).
    auto tup = validateArgsCountType(kernel, tup_);
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
// Macro form: expands directly to a triple-chevron launch.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
    do {                                                                                          \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);        \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif

#include <hip/hip_runtime_api.h>
#endif // !defined(__HIPCC_RTC__)

#if defined(__HIPCC_RTC__)
// hipRTC-only dim3: the full definition normally comes from
// hip_runtime_api.h, which RTC compiles cannot include. Components default
// to 1, matching CUDA's dim3 semantics.
typedef struct dim3 {
    uint32_t x;
    uint32_t y;
    uint32_t z;
#ifdef __cplusplus
    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
#endif
} dim3;
#endif // defined(__HIPCC_RTC__)
260
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__

// Thin wrappers over the OCKL device-library work-item queries. The unsigned
// argument selects the dimension (0 = x, 1 = y, 2 = z); results are narrowed
// from size_t to unsigned int.

// Thread index within the block (workgroup-local id).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
__DEVICE__ unsigned int __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ unsigned int __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ unsigned int __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

// Block index within the grid (workgroup id).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
__DEVICE__ unsigned int __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ unsigned int __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ unsigned int __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

// Block dimensions (workgroup size).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
__DEVICE__ unsigned int __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ unsigned int __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ unsigned int __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

// Grid dimensions in blocks (number of workgroups).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
__DEVICE__ unsigned int __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ unsigned int __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
283
// Declares member DIMENSION as a read-only property whose getter evaluates
// FUNCTION; clang accepts __declspec(property) here, so e.g. threadIdx.x
// expands into a call of the matching __hip_get_* wrapper above.
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
   __declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION; \
   __DEVICE__ unsigned int __get_##DIMENSION(void) { \
     return FUNCTION; \
   }

// threadIdx: thread coordinates within the block.
struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
#ifdef __cplusplus
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};

// blockIdx: block coordinates within the grid.
struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
#ifdef __cplusplus
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};

// blockDim: block dimensions in threads.
struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
#ifdef __cplusplus
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};

// gridDim: grid dimensions in blocks.
struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
#ifdef __cplusplus
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};

#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")
328
// The CUDA-style builtin coordinate variables. Declared extern const and weak;
// every member access goes through the property getters declared above rather
// than through stored data.
extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;

// Legacy HIP spellings of the builtin coordinate accessors.
#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
349
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_math_functions.h>
#endif

#if __HIP_HCC_COMPAT_MODE__
// Define HCC work item functions in terms of HIP builtin variables.
// hc_get_<name>(i) returns component i (0 = x, 1 = y, anything else = z) of
// the corresponding HIP builtin variable.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
  if (i==0) \
    return hip_var.x; \
  else if(i==1) \
    return hip_var.y; \
  else \
    return hip_var.z; \
}

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")

// Global (grid-wide) work-item id in the given dimension, via the OCKL
// global-id query; narrowed from size_t to unsigned int.
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
inline __device__ __attribute__((always_inline)) unsigned int
hc_get_workitem_absolute_id(int dim)
{
    return (unsigned int)__ockl_get_global_id(dim);
}

#endif
381
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if !defined(__HIPCC_RTC__)
// Support std::complex.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
// Workaround for using libc++ with HIP-Clang.
// The following headers require the clang include path before the standard C++
// include path. However, the libc++ include path must come before the clang
// include path. To work around this, we pass -isystem with the parent
// directory of the clang include path instead of the clang include path itself.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // !defined(__HIPCC_RTC__)
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif // __HIP_CLANG_ONLY__

#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H