From 84cd3d4ba48126588dac871b7689da966591b521 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sat, 9 Mar 2024 07:02:52 +0200 Subject: [PATCH 01/33] CHIP_USE_EXTERNAL_HIP_TESTS ON --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21833304e..e9ee2c5aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,7 +213,7 @@ option(CHIP_ENABLE_UNCOMPILABLE_TESTS "Enable tests which are known to not compi option(CHIP_BUILD_TESTS "Enable build_tests target" ON) option(CHIP_BUILD_SAMPLES "Build samples" ON) option(CHIP_DUBIOUS_LOCKS "Enable locks that don't seem necessary but make a lot of valgrind issues go away" OFF) -option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" OFF) +option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" ON) option(CHIP_ENABLE_NON_COMPLIANT_DEVICELIB_CODE "Enable non-compliant devicelib code such as calling LLVM builtins from inside kernel code. Enables certain unsigned long devicelib func variants" OFF) option(CHIP_FAST_MATH "Use native_ OpenCL functions which are fast but their precision is implementation defined" OFF) option(CHIP_USE_INTEL_USM "When enabled, cl_intel_unified_shared_memory extension, when available, will be used for HIP allocations in the OpenCL backend" ON) From a297110e298ff6922879dc5e6f59c0aaff9162ff Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sat, 9 Mar 2024 07:18:43 +0200 Subject: [PATCH 02/33] create hipconfig symlink --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9ee2c5aa..57d43ef00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -355,6 +355,7 @@ add_dependencies(CHIP hipcc.bin hipconfig.bin) file(MAKE_DIRECTORY ${HIPCC_BUILD_PATH}) file(CREATE_LINK hipcc.bin ${HIPCC_BUILD_PATH}/hipcc SYMBOLIC) +file(CREATE_LINK hipconfig.bin ${HIPCC_BUILD_PATH}/hipconfig SYMBOLIC) add_subdirectory(bin) From c51a1208535865c137ed4805fc8f6cfa6ebfb37e Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sat, 9 Mar 2024 09:43:20 +0200 Subject: [PATCH 03/33] update HIPCC - passthrough --genco --- HIPCC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HIPCC b/HIPCC index f43fdf9bb..350c9a602 160000 --- a/HIPCC +++ b/HIPCC @@ -1 +1 @@ -Subproject commit f43fdf9bbda1fdb21962e83382ea166efcf1361a +Subproject commit 350c9a6023dd951f959a7a8f23ac629d3f895ecc From 35987cd103819354e995da0ccf708e799b08a74f Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sun, 10 Mar 2024 05:56:57 +0200 Subject: [PATCH 04/33] Rebase HIP on develop 6.x * Update CHIPBindings to include new API --- HIP | 2 +- include/hip/spirv_hip_runtime.h | 22 +++ samples/hipInfo/hipInfo.cpp | 1 - src/CHIPBackend.hh | 2 +- src/CHIPBindings.cc | 231 +++++++++++++++++++++++- src/backend/Level0/CHIPBackendLevel0.cc | 1 - src/backend/OpenCL/CHIPBackendOpenCL.cc | 1 - src/common.hh | 11 +- 8 files changed, 254 insertions(+), 17 deletions(-) diff --git a/HIP b/HIP index 48d32b6f7..03f0970a0 160000 --- a/HIP +++ b/HIP @@ -1 +1 @@ -Subproject commit 48d32b6f778b22cd9fe4e32f46d8a9c794aff16d +Subproject commit 03f0970a02f01e538f35ed9d6a9baaf37469568f diff --git a/include/hip/spirv_hip_runtime.h b/include/hip/spirv_hip_runtime.h index 67a1188ca..020878b52 100644 --- a/include/hip/spirv_hip_runtime.h +++ b/include/hip/spirv_hip_runtime.h @@ -48,6 +48,28 @@ THE SOFTWARE. #include #include +struct ihipEvent_t {}; +struct ihipCtx_t {}; +struct ihipStream_t {}; +struct ihipModule_t {}; +struct ihipModuleSymbol_t {}; +struct ihipGraph {}; +struct hipGraphNode {}; +struct hipGraphExec {}; + +typedef struct hipArray { + void* data; // FIXME: generalize this + struct hipChannelFormatDesc desc; + unsigned int type; + unsigned int width; + unsigned int height; + unsigned int depth; + enum hipArray_Format Format; + unsigned int NumChannels; + bool isDrv; + unsigned int textureType; +}hipArray; + // Feature tests: #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || \ __HIP_DEVICE_COMPILE__ diff --git a/samples/hipInfo/hipInfo.cpp b/samples/hipInfo/hipInfo.cpp index bbee0c09c..fea1e5483 100644 --- a/samples/hipInfo/hipInfo.cpp +++ b/samples/hipInfo/hipInfo.cpp @@ -137,7 +137,6 @@ void printDeviceProp(int deviceId) { cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; - cout << setw(w1) << "gcnArch: " << props.gcnArch << endl; cout << setw(w1) << "maxTexture1DLinear: " << props.maxTexture1DLinear << endl; diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh index 86ee46e31..9e4b76594 100644 --- a/src/CHIPBackend.hh +++ b/src/CHIPBackend.hh @@ -37,7 +37,7 @@ #include "spirv.hh" #include "common.hh" -#include "hip/hip_runtime_api.h" +#include "hip/spirv_hip_runtime.h" #include "hip/spirv_hip.hh" #include "CHIPDriver.hh" diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index 4fc0c1eb0..b44746b2e 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -46,7 +46,7 @@ #include "CHIPException.hh" #include "common.hh" #include "hip/hip_interop.h" -#include "hip/hip_runtime_api.h" +#include "hip/spirv_hip_runtime.h" #include "hip/spirv_spt.h" #include "hip_conversions.hh" #include "macros.hh" @@ -73,6 +73,233 @@ hipError_t hipFreeArray(hipArray *Array); +hipError_t hipMallocMipmappedArray(hipMipmappedArray_t *mipmappedArray, + const struct hipChannelFormatDesc *desc, + struct hipExtent extent, + unsigned int numLevels, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipDeviceGetGraphMemAttribute(int device, + hipGraphMemAttributeType attr, + void *value) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipDeviceGraphMemTrim(int device) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipDeviceSetGraphMemAttribute(int device, + hipGraphMemAttributeType attr, + void *value) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipDrvGetErrorName(hipError_t hipError, const char **errorString) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipDrvGetErrorString(hipError_t hipError, const char **errorString) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipFuncSetAttribute(const void *func, hipFuncAttribute attr, + int value) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGetMipmappedArrayLevel(hipArray_t *levelArray, + hipMipmappedArray_const_t mipmappedArray, + unsigned int level) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphAddMemAllocNode(hipGraphNode_t *pGraphNode, hipGraph_t graph, + const hipGraphNode_t *pDependencies, + size_t numDependencies, + hipMemAllocNodeParams *pNodeParams) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphAddMemFreeNode(hipGraphNode_t *pGraphNode, hipGraph_t graph, + const hipGraphNode_t *pDependencies, + size_t numDependencies, void *dptr) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char *path, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, + hipGraphNode_t hDst) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, + hipKernelNodeAttrID attr, + hipKernelNodeAttrValue *value) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, + hipKernelNodeAttrID attr, + const hipKernelNodeAttrValue *value) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, + hipMemAllocNodeParams *pNodeParams) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void *dptr) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, + unsigned int *isEnabled) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, + unsigned int isEnabled) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMallocMipmappedArray(hipMipmappedArray_t mipmappedArray, + const struct hipChannelFormatDesc desc, + struct hipExtent extent, + unsigned int numLevels, unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemAddressFree(void *devPtr, size_t size) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemAddressReserve(void **ptr, size_t size, size_t alignment, + void *addr, unsigned long long flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemCreate(hipMemGenericAllocationHandle_t *handle, size_t size, + const hipMemAllocationProp *prop, + unsigned long long flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t +hipMemGetAllocationGranularity(size_t *granularity, + const hipMemAllocationProp *prop, + hipMemAllocationGranularity_flags option) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemMap(void *ptr, size_t size, size_t offset, + hipMemGenericAllocationHandle_t handle, + unsigned long long flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData *shareData, + void *ptr) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t +hipMemPoolExportToShareableHandle(void *shareableHandle, hipMemPool_t memPool, + hipMemAllocationHandleType handleType, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemPoolImportFromShareableHandle( + hipMemPool_t *memPool, void *shareableHandle, + hipMemAllocationHandleType handleType, unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemPoolImportPointer(void **ptr, hipMemPool_t memPool, + hipMemPoolPtrExportData *shareData) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t *handle, + void *addr) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipMemUnmap(void *ptr, size_t size) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipSignalExternalSemaphoresAsync( + const hipExternalSemaphore_t *extSemArray, + const hipExternalSemaphoreSignalParams *paramsArray, + unsigned int numExtSems, hipStream_t stream) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t *device) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, + hipGraphNode_t *dependencies, + size_t numDependencies, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamWaitValue32(hipStream_t stream, void *ptr, uint32_t value, + unsigned int flags, uint32_t mask) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamWaitValue64(hipStream_t stream, void *ptr, uint64_t value, + unsigned int flags, uint64_t mask) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamWriteValue32(hipStream_t stream, void *ptr, uint32_t value, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipStreamWriteValue64(hipStream_t stream, void *ptr, uint64_t value, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode *mode) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipWaitExternalSemaphoresAsync( + const hipExternalSemaphore_t *extSemArray, + const hipExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, + hipStream_t stream) { + UNIMPLEMENTED(hipErrorNotSupported); +} + hipError_t hipDeviceGetP2PAttribute(int *value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice) { UNIMPLEMENTED(hipErrorNotSupported); @@ -1255,7 +1482,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, attributes->devicePointer = const_cast(ptr); attributes->hostPointer = AllocInfo->HostPtr; attributes->isManaged = AllocInfo->Managed; - attributes->memoryType = AllocInfo->MemoryType; + // attributes->memoryType = AllocInfo->MemoryType; // Seems strange but the expected behavior is that if // hipPointerGetAttributes gets called with an offset host pointer, the diff --git a/src/backend/Level0/CHIPBackendLevel0.cc b/src/backend/Level0/CHIPBackendLevel0.cc index aa7f4daaa..58ace2981 100644 --- a/src/backend/Level0/CHIPBackendLevel0.cc +++ b/src/backend/Level0/CHIPBackendLevel0.cc @@ -2087,7 +2087,6 @@ void CHIPDeviceLevel0::populateDevicePropertiesImpl() { HipDeviceProps_.pciDeviceID = 0x40 + getDeviceId(); HipDeviceProps_.isMultiGpuBoard = 0; HipDeviceProps_.canMapHostMemory = 1; - HipDeviceProps_.gcnArch = 0; HipDeviceProps_.integrated = (ZeDeviceProps_.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? 1 : 0; HipDeviceProps_.maxSharedMemoryPerMultiProcessor = diff --git a/src/backend/OpenCL/CHIPBackendOpenCL.cc b/src/backend/OpenCL/CHIPBackendOpenCL.cc index 95ef8cd87..3796e3047 100644 --- a/src/backend/OpenCL/CHIPBackendOpenCL.cc +++ b/src/backend/OpenCL/CHIPBackendOpenCL.cc @@ -524,7 +524,6 @@ void CHIPDeviceOpenCL::populateDevicePropertiesImpl() { HipDeviceProps_.pciDeviceID = 0x40 + getDeviceId(); HipDeviceProps_.isMultiGpuBoard = 0; HipDeviceProps_.canMapHostMemory = 1; - HipDeviceProps_.gcnArch = 0; HipDeviceProps_.integrated = 0; HipDeviceProps_.maxSharedMemoryPerMultiProcessor = HipDeviceProps_.sharedMemPerBlock * 16; diff --git a/src/common.hh b/src/common.hh index 40a2b394b..b4d3ebd49 100644 --- a/src/common.hh +++ b/src/common.hh @@ -41,16 +41,7 @@ #include #include -/// The implementation of ihipEvent_t. The chipstar::Event class inherits this -/// so ihipEvent_t pointers may carry chipstar::Event instances. -struct ihipEvent_t {}; -struct ihipCtx_t {}; -struct ihipStream_t {}; -struct ihipModule_t {}; -struct ihipModuleSymbol_t {}; -struct ihipGraph {}; -struct hipGraphNode {}; -struct hipGraphExec {}; + using SPVFunctionInfoMap = std::map>; From d56e6193a7729cf20081c53915074e555e1fa319 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sun, 10 Mar 2024 06:00:26 +0200 Subject: [PATCH 05/33] remove simple_kernel sample --- samples/CMakeLists.txt | 1 - samples/simple_kernel/CMakeLists.txt | 1 - samples/simple_kernel/simple_kernel.hip | 43 ------------------------- 3 files changed, 45 deletions(-) delete mode 100644 samples/simple_kernel/CMakeLists.txt delete mode 100644 samples/simple_kernel/simple_kernel.hip diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 50b47c75e..fb4f6cc15 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -123,7 +123,6 @@ set(SAMPLES hip-cuda printf shuffles - simple_kernel clock graphMatrixMultiply graphs diff --git a/samples/simple_kernel/CMakeLists.txt b/samples/simple_kernel/CMakeLists.txt deleted file mode 100644 index 2a1bb8e44..000000000 --- a/samples/simple_kernel/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_chip_test(simple_kernel simple_kernel PASSED simple_kernel.hip) diff --git a/samples/simple_kernel/simple_kernel.hip b/samples/simple_kernel/simple_kernel.hip deleted file mode 100644 index e0bec7fe1..000000000 --- a/samples/simple_kernel/simple_kernel.hip +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2021-22 chipStar developers - * Copyright (c) 2022 Henry Linjamäki / Parmance for Argonne National Laboratory - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - - -#include "hip/hip_runtime.h" -#include - -__global__ void kernel(int *Out, int *In) { *Out = *In; } - -int main() { - int InH = 123, OutH = 0, *InD, *OutD; - (void)hipMalloc(&OutD, sizeof(int)); - (void)hipMalloc(&InD, sizeof(int)); - (void)hipMemcpy(InD, &InH, sizeof(int), hipMemcpyHostToDevice); - kernel<<<1, 1>>>(OutD, InD); - (void)hipMemcpy(&OutH, OutD, sizeof(int), hipMemcpyDeviceToHost); - printf("OutH=%d\n", OutH); - if (OutH == 123) { - printf("PASSED\n"); - return 0; - } - return 1; -} From 403284f47d5d8093c4474dd4870a795b9804bdd1 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sun, 10 Mar 2024 06:02:28 +0200 Subject: [PATCH 06/33] update HIPCC submodule * Support --genco option --- HIPCC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HIPCC b/HIPCC index 350c9a602..5c7cf6a96 160000 --- a/HIPCC +++ b/HIPCC @@ -1 +1 @@ -Subproject commit 350c9a6023dd951f959a7a8f23ac629d3f895ecc +Subproject commit 5c7cf6a967d8595cbffa82601aa6bc9e0419b408 From 4c8f00aef8c8232871cfcc8f5963c2adfea7a532 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 08:43:02 +0200 Subject: [PATCH 07/33] add hipStreamAttachMemAsync to CHIPBindings --- src/CHIPBindings.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index b44746b2e..04a096162 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -73,11 +73,15 @@ hipError_t hipFreeArray(hipArray *Array); +hipError_t hipStreamAttachMemAsync(hipStream_t stream, void *dev_ptr, + size_t length, unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + hipError_t hipMallocMipmappedArray(hipMipmappedArray_t *mipmappedArray, const struct hipChannelFormatDesc *desc, struct hipExtent extent, - unsigned int numLevels, - unsigned int flags) { + unsigned int numLevels, unsigned int flags) { UNIMPLEMENTED(hipErrorNotSupported); } From 53b989258e3b6306e7fbfb22bd9d73ef23a66287 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 08:43:21 +0200 Subject: [PATCH 08/33] placeholders for cooperative groups --- include/hip/spirv_hip_cooperative_groups.h | 843 +++++++++++++++++- .../hip/spirv_hip_cooperative_groups_helper.h | 263 ++++++ include/hip/spirv_hip_devicelib.hh | 73 ++ 3 files changed, 1167 insertions(+), 12 deletions(-) create mode 100644 include/hip/spirv_hip_cooperative_groups_helper.h diff --git a/include/hip/spirv_hip_cooperative_groups.h b/include/hip/spirv_hip_cooperative_groups.h index 77e2f507d..28bef4fe6 100644 --- a/include/hip/spirv_hip_cooperative_groups.h +++ b/include/hip/spirv_hip_cooperative_groups.h @@ -1,12 +1,831 @@ -#ifndef HIP_INCLUDE_HIP_SPIRV_HIP_COOPERATIVE_GROUPS_H -#define HIP_INCLUDE_HIP_SPIRV_HIP_COOPERATIVE_GROUPS_H - -//// Include CUDA headers -//#include -//#include -// -//// Include HIP wrapper headers around CUDA -//#include -//#include - -#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/hip_cooperative_groups.h + * + * @brief Device side implementation of `Cooperative Group` feature. + * + * Defines new types and device API wrappers related to `Cooperative Group` + * feature, which the programmer can directly use in his kernel(s) in order to + * make use of this feature. + */ +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H + +#if __cplusplus +#if !defined(__HIPCC_RTC__) +#include +#endif + +#define __hip_assert assert + +namespace cooperative_groups { + +/** @brief The base type of all cooperative group types + * + * \details Holds the key properties of a constructed cooperative group types + * object, like the group type, its size, etc + * + * @note Cooperative groups feature is implemented on Linux, under developement + * on Windows. + */ +class thread_group { + protected: + uint32_t _type; // thread_group type + uint32_t _size; // total number of threads in the tread_group + uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types, + // LSB represents lane 0, and MSB represents lane 63 + + // Construct a thread group, and set thread group type and other essential + // thread group properties. This generic thread group is directly constructed + // only when the group is supposed to contain only the calling the thread + // (throurh the API - `this_thread()`), and in all other cases, this thread + // group object is a sub-object of some other derived thread group object + __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast(0), + uint64_t mask = static_cast(0)) { + _type = type; + _size = size; + _mask = mask; + } + + struct _tiled_info { + bool is_tiled; + unsigned int size; + unsigned int meta_group_rank; + unsigned int meta_group_size; + }; + + struct _coalesced_info { + lane_mask member_mask; + unsigned int size; + struct _tiled_info tiled_info; + } coalesced_info; + + friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, + unsigned int tile_size); + friend class thread_block; + + public: + // Total number of threads in the thread group, and this serves the purpose + // for all derived cooperative group types since their `size` is directly + // saved during the construction + __CG_QUALIFIER__ uint32_t size() const { return _size; } + __CG_QUALIFIER__ unsigned int cg_type() const { return _type; } + // Rank of the calling thread within [0, size()) + __CG_QUALIFIER__ uint32_t thread_rank() const; + // Is this cooperative group type valid? + __CG_QUALIFIER__ bool is_valid() const; + // synchronize the threads in the thread group + __CG_QUALIFIER__ void sync() const; +}; +/** + *------------------------------------------------------------------------------------------------- + *------------------------------------------------------------------------------------------------- + * @defgroup CooperativeG Cooperative Groups + * @ingroup API + * @{ + * This section describes the cooperative groups functions of HIP runtime API. + * + * The cooperative groups provides flexible thread parallel programming algorithms, threads + * cooperate and share data to perform collective computations. + * + * @note Cooperative groups feature is implemented on Linux, under developement + * on Windows. + * + */ +/** \brief The multi-grid cooperative group type + * + * \details Represents an inter-device cooperative group type where the + * participating threads within the group spans across multple + * devices, running the (same) kernel on these devices + * @note The multi-grid cooperative group type is implemented on Linux, under developement + * on Windows. + */ +class multi_grid_group : public thread_group { + // Only these friend functions are allowed to construct an object of this class + // and access its resources + friend __CG_QUALIFIER__ multi_grid_group this_multi_grid(); + + protected: + // Construct mutli-grid thread group (through the API this_multi_grid()) + explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size) + : thread_group(internal::cg_multi_grid, size) {} + + public: + // Number of invocations participating in this multi-grid group. In other + // words, the number of GPUs + __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); } + // Rank of this invocation. In other words, an ID number within the range + // [0, num_grids()) of the GPU, this kernel is running on + __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); } + __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); } + __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); } + __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); } +}; + +/** @brief User exposed API interface to construct multi-grid cooperative + * group type object - `multi_grid_group` + * + * \details User is not allowed to directly construct an object of type + * `multi_grid_group`. Instead, he should construct it through this + * API function + * @note This multi-grid cooperative API type is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ multi_grid_group this_multi_grid() { + return multi_grid_group(internal::multi_grid::size()); +} + +/** @brief The grid cooperative group type + * + * \details Represents an inter-workgroup cooperative group type where the + * participating threads within the group spans across multiple + * workgroups running the (same) kernel on the same device + * @note This is implemented on Linux, under developement + * on Windows. + */ +class grid_group : public thread_group { + // Only these friend functions are allowed to construct an object of this class + // and access its resources + friend __CG_QUALIFIER__ grid_group this_grid(); + + protected: + // Construct grid thread group (through the API this_grid()) + explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {} + + public: + __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); } + __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); } + __CG_QUALIFIER__ void sync() const { internal::grid::sync(); } +}; + +/** @brief User exposed API interface to construct grid cooperative group type + * object - `grid_group` + * + * \details User is not allowed to directly construct an object of type + * `multi_grid_group`. Instead, he should construct it through this + * API function + * @note This function is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); } + +/** @brief The workgroup (thread-block in CUDA terminology) cooperative group + * type + * + * \details Represents an intra-workgroup cooperative group type where the + * participating threads within the group are exactly the same threads + * which are participated in the currently executing `workgroup` + * @note This is implemented on Linux, under developement + * on Windows. + */ +class thread_block : public thread_group { + // Only these friend functions are allowed to construct an object of thi + // class and access its resources + friend __CG_QUALIFIER__ thread_block this_thread_block(); + friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, + unsigned int tile_size); + friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, + unsigned int tile_size); + protected: + // Construct a workgroup thread group (through the API this_thread_block()) + explicit __CG_QUALIFIER__ thread_block(uint32_t size) + : thread_group(internal::cg_workgroup, size) {} + + __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const { + const bool pow2 = ((tile_size & (tile_size - 1)) == 0); + // Invalid tile size, assert + if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + __hip_assert(false && "invalid tile size"); + } + + thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size); + tiledGroup.coalesced_info.tiled_info.size = tile_size; + tiledGroup.coalesced_info.tiled_info.is_tiled = true; + tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; + tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size; + return tiledGroup; + } + + public: + // 3-dimensional block index within the grid + __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); } + // 3-dimensional thread index within the block + __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); } + __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); } + __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); } + __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); } + __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); } + __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); } +}; + +/** \brief User exposed API interface to construct workgroup cooperative + * group type object - `thread_block`. + * + * \details User is not allowed to directly construct an object of type + * `thread_block`. Instead, he should construct it through this API + * function. + * @note This function is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ thread_block this_thread_block() { + return thread_block(internal::workgroup::size()); +} + +/** \brief The tiled_group cooperative group type + * + * \details Represents one tiled thread group in a wavefront. + * This group type also supports sub-wave level intrinsics. + * @note This is implemented on Linux, under developement + * on Windows. + */ + +class tiled_group : public thread_group { + private: + friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, + unsigned int tile_size); + friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, + unsigned int tile_size); + + __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const { + const bool pow2 = ((tile_size & (tile_size - 1)) == 0); + + if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + __hip_assert(false && "invalid tile size"); + } + + if (size() <= tile_size) { + return *this; + } + + tiled_group tiledGroup = tiled_group(tile_size); + tiledGroup.coalesced_info.tiled_info.is_tiled = true; + return tiledGroup; + } + + protected: + explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize) + : thread_group(internal::cg_tiled_group, tileSize) { + coalesced_info.tiled_info.size = tileSize; + coalesced_info.tiled_info.is_tiled = true; + } + + public: + __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); } + + __CG_QUALIFIER__ unsigned int thread_rank() const { + return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1)); + } + + __CG_QUALIFIER__ void sync() const { + internal::tiled_group::sync(); + } +}; + +/** \brief The coalesced_group cooperative group type + * + * \details Represents a active thread group in a wavefront. + * This group type also supports sub-wave level intrinsics. + * @note This is implemented on Linux, under developement + * on Windows. + */ +class coalesced_group : public thread_group { + private: + friend __CG_QUALIFIER__ coalesced_group coalesced_threads(); + friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size); + friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size); + + __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const { + const bool pow2 = ((tile_size & (tile_size - 1)) == 0); + + if (!tile_size || (tile_size > size()) || !pow2) { + return coalesced_group(0); + } + + // If a tiled group is passed to be partitioned further into a coalesced_group. + // prepare a mask for further partitioning it so that it stays coalesced. + if (coalesced_info.tiled_info.is_tiled) { + unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); + unsigned int masklength = min(static_cast(size()) - base_offset, tile_size); + lane_mask member_mask = static_cast(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); + + member_mask <<= (__lane_id() & ~(tile_size - 1)); + coalesced_group coalesced_tile = coalesced_group(member_mask); + coalesced_tile.coalesced_info.tiled_info.is_tiled = true; + coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size; + return coalesced_tile; + } + // Here the parent coalesced_group is not partitioned. + else { + lane_mask member_mask = 0; + unsigned int tile_rank = 0; + int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size; + + for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) { + lane_mask active = coalesced_info.member_mask & (1 << i); + // Make sure the lane is active + if (active) { + if (lanes_to_skip <= 0 && tile_rank < tile_size) { + // Prepare a member_mask that is appropriate for a tile + member_mask |= active; + tile_rank++; + } + lanes_to_skip--; + } + } + coalesced_group coalesced_tile = coalesced_group(member_mask); + coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_size = + (size() + tile_size - 1) / tile_size; + return coalesced_tile; + } + return coalesced_group(0); + } + + protected: + // Constructor + explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask) + : thread_group(internal::cg_coalesced_group) { + coalesced_info.member_mask = member_mask; // Which threads are active + coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active + coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group + coalesced_info.tiled_info.meta_group_rank = 0; + coalesced_info.tiled_info.meta_group_size = 1; + } + + public: + __CG_QUALIFIER__ unsigned int size() const { + return coalesced_info.size; + } + + __CG_QUALIFIER__ unsigned int thread_rank() const { + return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask); + } + + __CG_QUALIFIER__ void sync() const { + internal::coalesced_group::sync(); + } + + __CG_QUALIFIER__ unsigned int meta_group_rank() const { + return coalesced_info.tiled_info.meta_group_rank; + } + + __CG_QUALIFIER__ unsigned int meta_group_size() const { + return coalesced_info.tiled_info.meta_group_size; + } + + template + __CG_QUALIFIER__ T shfl(T var, int srcRank) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + + srcRank = srcRank % static_cast(size()); + + int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank + : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) + : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); + + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + } + + template + __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + + // Note: The cuda implementation appears to use the remainder of lane_delta + // and WARP_SIZE as the shift value rather than lane_delta itself. + // This is not described in the documentation and is not done here. + + if (size() == __AMDGCN_WAVEFRONT_SIZE) { + return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); + } + + int lane; + if (__AMDGCN_WAVEFRONT_SIZE == 64) { + lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1); + } + else { + lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1); + } + + if (lane == -1) { + lane = __lane_id(); + } + + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + } + + template + __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + + // Note: The cuda implementation appears to use the remainder of lane_delta + // and WARP_SIZE as the shift value rather than lane_delta itself. + // This is not described in the documentation and is not done here. + + if (size() == __AMDGCN_WAVEFRONT_SIZE) { + return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); + } + + int lane; + if (__AMDGCN_WAVEFRONT_SIZE == 64) { + lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); + } + else if (__AMDGCN_WAVEFRONT_SIZE == 32) { + lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); + } + + if (lane == -1) { + lane = __lane_id(); + } + + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + } +}; + +/** \brief User exposed API to create coalesced groups. + * + * \details A collective operation that groups all active lanes into a new thread group. + * @note This function is implemented on Linux, under developement + * on Windows. + */ + +__CG_QUALIFIER__ coalesced_group coalesced_threads() { + return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec()); +} + +/** + * Implemenation of all publicly exposed base class APIs + * @note This function is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { + switch (this->_type) { + case internal::cg_multi_grid: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_grid: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_workgroup: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_tiled_group: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_coalesced_group: { + return (static_cast(this)->thread_rank()); + } + default: { + __hip_assert(false && "invalid cooperative group type"); + return -1; + } + } +} +/** + * Implemenation of all publicly exposed thread group API + * @note This function is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ bool thread_group::is_valid() const { + switch (this->_type) { + case internal::cg_multi_grid: { + return (static_cast(this)->is_valid()); + } + case internal::cg_grid: { + return (static_cast(this)->is_valid()); + } + case internal::cg_workgroup: { + return (static_cast(this)->is_valid()); + } + case internal::cg_tiled_group: { + return (static_cast(this)->is_valid()); + } + case internal::cg_coalesced_group: { + return (static_cast(this)->is_valid()); + } + default: { + __hip_assert(false && "invalid cooperative group type"); + return false; + } + } +} +/** + * Implemenation of all publicly exposed thread group sync API + * @note This function is implemented on Linux, under developement + * on Windows. + */ +__CG_QUALIFIER__ void thread_group::sync() const { + switch (this->_type) { + case internal::cg_multi_grid: { + static_cast(this)->sync(); + break; + } + case internal::cg_grid: { + static_cast(this)->sync(); + break; + } + case internal::cg_workgroup: { + static_cast(this)->sync(); + break; + } + case internal::cg_tiled_group: { + static_cast(this)->sync(); + break; + } + case internal::cg_coalesced_group: { + static_cast(this)->sync(); + break; + } + default: { + __hip_assert(false && "invalid cooperative group type"); + } + } +} + +/** + * Implemenation of publicly exposed `wrapper` API on top of basic cooperative + * group type APIs + * @note This function is implemented on Linux, under developement + * on Windows. + */ +template __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); } +/** + * Implemenation of publicly exposed `wrapper` API on top of basic cooperative + * group type APIs + * @note This function is implemented on Linux, under developement + * on Windows. + */ +template __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) { + return g.thread_rank(); +} +/** + * Implemenation of publicly exposed `wrapper` API on top of basic cooperative + * group type APIs + * @note This function is implemented on Linux, under developement + * on Windows. + */ +template __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); } +/** + * Implemenation of publicly exposed `wrapper` API on top of basic cooperative + * group type APIs + * @note This function is implemented on Linux, under developement + * on Windows. + */ +template __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); } +/** + * template class tile_base + * @note This class is implemented on Linux, under developement + * on Windows. + */ +template class tile_base { + protected: + _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; + + public: + // Rank of the thread within this tile + _CG_STATIC_CONST_DECL_ unsigned int thread_rank() { + return (internal::workgroup::thread_rank() & (numThreads - 1)); + } + + // Number of threads within this tile + __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; } +}; +/** + * template class thread_block_tile_base + * @note This class is implemented on Linux, under developement + * on Windows. + */ +template class thread_block_tile_base : public tile_base { + static_assert(is_valid_tile_size::value, + "Tile size is either not a power of 2 or greater than the wavefront size"); + using tile_base::numThreads; + + public: + __CG_STATIC_QUALIFIER__ void sync() { + internal::tiled_group::sync(); + } + + template __CG_QUALIFIER__ T shfl(T var, int srcRank) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + return (__shfl(var, srcRank, numThreads)); + } + + template __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + return (__shfl_down(var, lane_delta, numThreads)); + } + + template __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + return (__shfl_up(var, lane_delta, numThreads)); + } + + template __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const { + static_assert(is_valid_type::value, "Neither an integer or float type."); + return (__shfl_xor(var, laneMask, numThreads)); + } +}; +/** \brief User exposed API that captures the state of the parent group pre-partition + */ +template +class parent_group_info { +public: + // Returns the linear rank of the group within the set of tiles partitioned + // from a parent group (bounded by meta_group_size) + __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() { + return ParentCGTy::thread_rank() / tileSize; + } + + // Returns the number of groups created when the parent group was partitioned. + __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() { + return (ParentCGTy::size() + tileSize - 1) / tileSize; + } +}; + +/** \brief Group type - thread_block_tile + * + * \details Represents one tile of thread group. + * @note This type is implemented on Linux, under developement + * on Windows. + */ +template +class thread_block_tile_type : public thread_block_tile_base, + public tiled_group, + public parent_group_info { + _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; + typedef thread_block_tile_base tbtBase; + protected: + __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) { + coalesced_info.tiled_info.size = numThreads; + coalesced_info.tiled_info.is_tiled = true; + } + public: + using tbtBase::size; + using tbtBase::sync; + using tbtBase::thread_rank; +}; + +// Partial template specialization +template +class thread_block_tile_type : public thread_block_tile_base, + public tiled_group + { + _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; + + typedef thread_block_tile_base tbtBase; + + protected: + + __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size) + : tiled_group(numThreads) { + coalesced_info.tiled_info.size = numThreads; + coalesced_info.tiled_info.is_tiled = true; + coalesced_info.tiled_info.meta_group_rank = meta_group_rank; + coalesced_info.tiled_info.meta_group_size = meta_group_size; + } + + public: + using tbtBase::size; + using tbtBase::sync; + using tbtBase::thread_rank; + + __CG_QUALIFIER__ unsigned int meta_group_rank() const { + return coalesced_info.tiled_info.meta_group_rank; + } + + __CG_QUALIFIER__ unsigned int meta_group_size() const { + return coalesced_info.tiled_info.meta_group_size; + } +// end of operative group +/** +* @} +*/ +}; + + +/** \brief User exposed API to partition groups. + * + * \details A collective operation that partitions the parent group into a one-dimensional, + * row-major, tiling of subgroups. + */ + +__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) { + if (parent.cg_type() == internal::cg_tiled_group) { + const tiled_group* cg = static_cast(&parent); + return cg->new_tiled_group(tile_size); + } + else if(parent.cg_type() == internal::cg_coalesced_group) { + const coalesced_group* cg = static_cast(&parent); + return cg->new_tiled_group(tile_size); + } + else { + const thread_block* tb = static_cast(&parent); + return tb->new_tiled_group(tile_size); + } +} + +// Thread block type overload +__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) { + return (parent.new_tiled_group(tile_size)); +} + +__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) { + return (parent.new_tiled_group(tile_size)); +} + +// If a coalesced group is passed to be partitioned, it should remain coalesced +__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) { + return (parent.new_tiled_group(tile_size)); +} + +template class thread_block_tile; + +namespace impl { +template class thread_block_tile_internal; + +template +class thread_block_tile_internal : public thread_block_tile_type { + protected: + template + __CG_QUALIFIER__ thread_block_tile_internal( + const thread_block_tile_internal& g) + : thread_block_tile_type(g.meta_group_rank(), g.meta_group_size()) {} + + __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g) + : thread_block_tile_type() {} +}; +} // namespace impl + +template +class thread_block_tile : public impl::thread_block_tile_internal { + protected: + __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g) + : impl::thread_block_tile_internal(g) {} + + public: + __CG_QUALIFIER__ operator thread_block_tile() const { + return thread_block_tile(*this); + } +}; + + +template +class thread_block_tile : public impl::thread_block_tile_internal { + template friend class thread_block_tile; + + protected: + public: + template + __CG_QUALIFIER__ thread_block_tile(const thread_block_tile& g) + : impl::thread_block_tile_internal(g) {} +}; + +template class thread_block_tile; + +namespace impl { +template struct tiled_partition_internal; + +template +struct tiled_partition_internal : public thread_block_tile { + __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g) + : thread_block_tile(g) {} +}; + +} // namespace impl + +/** \brief User exposed API to partition groups. + * + * \details This constructs a templated class derieved from thread_group. + * The template defines tile size of the new thread group at compile time. + */ +template +__CG_QUALIFIER__ thread_block_tile tiled_partition(const ParentCGTy& g) { + static_assert(is_valid_tile_size::value, + "Tiled partition with size > wavefront size. Currently not supported "); + return impl::tiled_partition_internal(g); +} +} // namespace cooperative_groups + +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/spirv_hip_cooperative_groups_helper.h b/include/hip/spirv_hip_cooperative_groups_helper.h new file mode 100644 index 000000000..235cebfd8 --- /dev/null +++ b/include/hip/spirv_hip_cooperative_groups_helper.h @@ -0,0 +1,263 @@ +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/hip_cooperative_groups_helper.h + * + * @brief Device side implementation of cooperative group feature. + * + * Defines helper constructs and APIs which aid the types and device API + * wrappers defined within `amd_detail/hip_cooperative_groups.h`. + */ +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H + +#if __cplusplus +#if !defined(__HIPCC_RTC__) +#include // threadId, blockId +#include +#endif +#if !defined(__align__) +#define __align__(x) __attribute__((aligned(x))) +#endif + +#if !defined(__CG_QUALIFIER__) +#define __CG_QUALIFIER__ __device__ __forceinline__ +#endif + +#if !defined(__CG_STATIC_QUALIFIER__) +#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__ +#endif + +#if !defined(_CG_STATIC_CONST_DECL_) +#define _CG_STATIC_CONST_DECL_ static constexpr +#endif + +#define __AMDGCN_WAVEFRONT_SIZE 32 +#if __AMDGCN_WAVEFRONT_SIZE == 32 +using lane_mask = unsigned int; +#else +using lane_mask = unsigned long long int; +#endif + +namespace cooperative_groups { + +/* Global scope */ +template +using is_power_of_2 = std::integral_constant; + +template +using is_valid_wavefront = std::integral_constant; + +template +using is_valid_tile_size = + std::integral_constant::value && is_valid_wavefront::value>; + +template +using is_valid_type = + std::integral_constant::value || std::is_floating_point::value>; + + +// TODO Cooperative Groups +uint32_t __device__ __ockl_multi_grid_num_grids() {return 0;}; +uint32_t __device__ __ockl_multi_grid_grid_rank() {return 0;}; +uint32_t __device__ __ockl_multi_grid_size() {return 0;}; +uint32_t __device__ __ockl_multi_grid_thread_rank() {return 0;}; +uint32_t __device__ __ockl_multi_grid_is_valid() {return 0;}; +uint32_t __device__ __ockl_multi_grid_sync() {return 0;}; +uint32_t __device__ __ockl_grid_sync() {return 0;}; +uint32_t __device__ __ockl_grid_is_valid() {return 0;}; +void __device__ __builtin_amdgcn_fence(int, const char*){}; +unsigned int __device__ __builtin_amdgcn_mbcnt_lo(unsigned int, unsigned int){return 0;}; +unsigned int __device__ __builtin_amdgcn_read_exec(void){return 0;}; + + +namespace internal { + +/** +* @brief Enums representing different cooperative group types +* @note This enum is only applicable on Linux. +* + */ +typedef enum { + cg_invalid, + cg_multi_grid, + cg_grid, + cg_workgroup, + cg_tiled_group, + cg_coalesced_group +} group_type; +/** + * @ingroup CooperativeG + * @{ + * This section describes the cooperative groups functions of HIP runtime API. + * + * The cooperative groups provides flexible thread parallel programming algorithms, threads + * cooperate and share data to perform collective computations. + * + * @note Cooperative groups feature is implemented on Linux, under developement + * on Windows. + * + */ +/** + * + * @brief Functionalities related to multi-grid cooperative group type + * @note The following cooperative groups functions are only applicable on Linux. + * + */ + + + +namespace multi_grid { + + + +__CG_STATIC_QUALIFIER__ uint32_t num_grids() { + return static_cast(__ockl_multi_grid_num_grids()); } + +__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { + return static_cast(__ockl_multi_grid_grid_rank()); } + +__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast(__ockl_multi_grid_size()); } + +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + return static_cast(__ockl_multi_grid_thread_rank()); } + +__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_multi_grid_is_valid()); } + +__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } + +} // namespace multi_grid + +/** + * @brief Functionalities related to grid cooperative group type + * @note The following cooperative groups functions are only applicable on Linux. + */ +namespace grid { + +__CG_STATIC_QUALIFIER__ uint32_t size() { + return static_cast((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * + (blockDim.x * gridDim.x)); +} + +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + // Compute global id of the workgroup to which the current thread belongs to + uint32_t blkIdx = static_cast((blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + (blockIdx.x)); + + // Compute total number of threads being passed to reach current workgroup + // within grid + uint32_t num_threads_till_current_workgroup = + static_cast(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); + + // Compute thread local rank within current workgroup + uint32_t local_thread_rank = static_cast((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + (threadIdx.x)); + + return (num_threads_till_current_workgroup + local_thread_rank); +} + +__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_grid_is_valid()); } + +__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } + +} // namespace grid + +/** + * @brief Functionalities related to `workgroup` (thread_block in CUDA terminology) + * cooperative group type + * @note The following cooperative groups functions are only applicable on Linux. + */ +namespace workgroup { + +__CG_STATIC_QUALIFIER__ dim3 group_index() { + return (dim3(static_cast(blockIdx.x), static_cast(blockIdx.y), + static_cast(blockIdx.z))); +} + +__CG_STATIC_QUALIFIER__ dim3 thread_index() { + return (dim3(static_cast(threadIdx.x), static_cast(threadIdx.y), + static_cast(threadIdx.z))); +} + +__CG_STATIC_QUALIFIER__ uint32_t size() { + return (static_cast(blockDim.x * blockDim.y * blockDim.z)); +} + +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + return (static_cast((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + (threadIdx.x))); +} + +__CG_STATIC_QUALIFIER__ bool is_valid() { + return true; +} + +__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); } + +__CG_STATIC_QUALIFIER__ dim3 block_dim() { + return (dim3(static_cast(blockDim.x), static_cast(blockDim.y), + static_cast(blockDim.z))); +} + +} // namespace workgroup + +namespace tiled_group { + +// enforce ordering for memory intructions +__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); } + +} // namespace tiled_group + +namespace coalesced_group { + +// enforce ordering for memory intructions +__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); } + +// Masked bit count +// +// For each thread, this function returns the number of active threads which +// have i-th bit of x set and come before the current thread. +__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { + unsigned int counter=0; + #if __AMDGCN_WAVEFRONT_SIZE == 32 + counter = __builtin_amdgcn_mbcnt_lo(x, add); + #else + counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); + counter = __builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), counter); + #endif + + return counter; +} + +} // namespace coalesced_group + + +} // namespace internal + +} // namespace cooperative_groups +/** +* @} +*/ + +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H diff --git a/include/hip/spirv_hip_devicelib.hh b/include/hip/spirv_hip_devicelib.hh index 865ceb5d1..fffb4657e 100644 --- a/include/hip/spirv_hip_devicelib.hh +++ b/include/hip/spirv_hip_devicelib.hh @@ -158,6 +158,79 @@ EXPORT unsigned long long clock64() { // loss can be avoided. EXPORT clock_t clock() { return (clock_t)clock64(); } +// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), +// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. +// If not found, return -1. +__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) { + uint64_t temp_mask = mask; + int32_t temp_offset = offset; + + if (offset == 0) { + temp_mask &= (1 << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int32_t total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int32_t pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} + +__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) { + uint64_t temp_mask = mask; + int32_t temp_offset = offset; + if (offset == 0) { + temp_mask &= (1 << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int32_t total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int32_t pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} + #include #endif From 6e85c2315a61832c4d5e4f71cf27a3789151f23d Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 10:42:30 +0200 Subject: [PATCH 09/33] CHIPBindings --- src/CHIPBindings.cc | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index 04a096162..3d0e092c0 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -71,6 +71,10 @@ #define DECONST_NODES(x) \ reinterpret_cast(const_cast(x)) +hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) { + UNIMPLEMENTED(hipErrorNotSupported); +} + hipError_t hipFreeArray(hipArray *Array); hipError_t hipStreamAttachMemAsync(hipStream_t stream, void *dev_ptr, @@ -4071,6 +4075,10 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes *Attr, CHIP_CATCH } +hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc) { + UNIMPLEMENTED(hipErrorTbd); +} + hipError_t hipModuleGetGlobal(hipDeviceptr_t *Dptr, size_t *Bytes, hipModule_t Hmod, const char *Name) { CHIP_TRY @@ -5112,6 +5120,33 @@ hipError_t hipLaunchCooperativeKernel_spt(const void *f, dim3 gridDim, UNIMPLEMENTED(hipErrorNotSupported); } +hipError_t hipModuleLaunchCooperativeKernel( + hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, + unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t stream, + void **kernelParams) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipModuleLaunchCooperativeKernelMultiDevice( + hipFunctionLaunchParams *launchParamsList, unsigned int numDevices, + unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t hipLaunchCooperativeKernel(const void *f, dim3 gridDim, + dim3 blockDimX, void **kernelParams, + unsigned int sharedMemBytes, + hipStream_t stream) { + UNIMPLEMENTED(hipErrorNotSupported); +} + +hipError_t +hipLaunchCooperativeKernelMultiDevice(hipLaunchParams *launchParamsList, + int numDevices, unsigned int flags) { + UNIMPLEMENTED(hipErrorNotSupported); +} + hipError_t hipLaunchKernel_spt(const void *function_address, dim3 numBlocks, dim3 dimBlocks, void **args, size_t sharedMemBytes, hipStream_t stream) { From 1a8aa46437a7d84a1be4ba4be497e136eb194417 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 10:44:58 +0200 Subject: [PATCH 10/33] add -gdwarf-4 Without this, hip-tests runs into a compilation/linking issue DWARF error: invalid or unhandled FORM value: 0x23 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57d43ef00..4328c4d0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ add_compile_options(-Wno-format-extra-args -mf16c) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-duplicate-decl-specifier \ -Wno-tautological-constant-compare -Wno-c++20-extensions -Wno-unused-result \ --Wno-delete-abstract-non-virtual-dtor -Wno-deprecated-declarations -Wunused-command-line-argument") +-Wno-delete-abstract-non-virtual-dtor -Wno-deprecated-declarations -Wunused-command-line-argument -gdwarf-4") # end temporary cmake_minimum_required(VERSION 3.20 FATAL_ERROR) From f6cb210b4d542820749017d39b2fb19762df99b2 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 13:02:51 +0200 Subject: [PATCH 11/33] update CHIPBindings --- src/CHIPBindings.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index 3d0e092c0..5034af850 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -77,6 +77,10 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const hipError_t hipFreeArray(hipArray *Array); +hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) { + UNIMPLEMENTED(hipErrorNotSupported); +} + hipError_t hipStreamAttachMemAsync(hipStream_t stream, void *dev_ptr, size_t length, unsigned int flags) { UNIMPLEMENTED(hipErrorNotSupported); @@ -4890,6 +4894,8 @@ hipOccupancyMaxActiveBlocksPerMultiprocessor(int *NumBlocks, const void *Func, CHIP_CATCH } + + hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int *NumBlocks, const void *Func, int BlockSize, size_t DynSharedMemPerBlk, unsigned int Flags) { From e39a6b74fd7693f77437ecd26bdf6539bce0b8fb Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 13:03:13 +0200 Subject: [PATCH 12/33] __managed__ keyword --- include/hip/spirv_hip_runtime.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hip/spirv_hip_runtime.h b/include/hip/spirv_hip_runtime.h index 020878b52..9fb4d82d6 100644 --- a/include/hip/spirv_hip_runtime.h +++ b/include/hip/spirv_hip_runtime.h @@ -57,6 +57,8 @@ struct ihipGraph {}; struct hipGraphNode {}; struct hipGraphExec {}; +#define __managed__ __device__ + typedef struct hipArray { void* data; // FIXME: generalize this struct hipChannelFormatDesc desc; From dcb79f46a4334838690758da902a87bda5675938 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 13:03:45 +0200 Subject: [PATCH 13/33] update hip-tests --- hip-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hip-tests b/hip-tests index d7159bbc0..4e952d145 160000 --- a/hip-tests +++ b/hip-tests @@ -1 +1 @@ -Subproject commit d7159bbc011be7e0a0da01115c8197568e96dc86 +Subproject commit 4e952d145c0a43f5a6c433db0cde6649eb73ea8a From d873b66767875200462fb3d32c2e41461bd9a816 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 11 Mar 2024 13:03:58 +0200 Subject: [PATCH 14/33] expectedArgs.cpp test file --- expectedArgs.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 expectedArgs.cpp diff --git a/expectedArgs.cpp b/expectedArgs.cpp new file mode 100644 index 000000000..a9eea491e --- /dev/null +++ b/expectedArgs.cpp @@ -0,0 +1,48 @@ + +#include + +template +std::tuple getExpectedArgs(void(*)(FArgs...)) {}; + +template +void validateArguments(F f, Args&&... args) { + using expectedArgsTuple = decltype(getExpectedArgs(f)); + using providedArgsTuple = std::tuple; + + static_assert(std::is_same::value, + "Kernel arguments types must match exactly!"); +} + +// General launchKernel function +template +void launchKernel(Kernel kernel, Dim numBlocks, Dim numThreads, std::uint32_t memPerBlock, hipStream_t stream, Args&&... args) { + // Define a stateless, capture-free lambda that matches the kernel's signature. + auto kernelWrapperLambda = [] (Args... args) { + // This lambda is intentionally left empty as it's used solely for type validation. + }; + + // Convert the lambda to a function pointer. + void (*kernelWrapper)(Args...) = kernelWrapperLambda; + + // Use the wrapper function pointer to validate arguments. + validateArguments(kernelWrapper, std::forward(args)...); + + // Launch the kernel directly with the provided arguments. + kernel<<>>(std::forward(args)...); + +} + +template void __global__ vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {} + +int main() { + int LEN = 1; + dim3 dimGrid(LEN / 512, 1, 1); + dim3 dimBlock(512, 1, 1); + float *A_d, *B_d, *C_d; + + + + launchKernel(vectorADD, dimGrid, dimBlock, + 0, 0, static_cast(A_d), + static_cast(B_d), C_d, static_cast(LEN)); +} \ No newline at end of file From a6c2bd50dc93054e04ee78d5287ce145c3ac715c Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 13 Mar 2024 02:57:27 +0200 Subject: [PATCH 15/33] devicelib __chip__fns32 --- bitcode/devicelib.cl | 75 +++++++++++++++++++ .../hip/devicelib/integer/int_intrinsics.hh | 28 +++++-- include/hip/spirv_hip_devicelib.hh | 73 ------------------ 3 files changed, 95 insertions(+), 81 deletions(-) diff --git a/bitcode/devicelib.cl b/bitcode/devicelib.cl index 0d495bb3a..fc136caaf 100644 --- a/bitcode/devicelib.cl +++ b/bitcode/devicelib.cl @@ -43,6 +43,81 @@ #error __opencl_c_generic_address_space needed! #endif +// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), +// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. +// If not found, return -1. +// In HIP long long is 64-bit integer. In OpenCL it's 128-bit integer. +EXPORT int __chip__fns64(unsigned long int mask, unsigned int base, int offset) { + unsigned long int temp_mask = mask; + int temp_offset = offset; + + if (offset == 0) { + temp_mask &= (1 << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + unsigned long int temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} + +EXPORT int __chip__fns32(unsigned long int mask, unsigned int base, int offset) { + unsigned long int temp_mask = mask; + int temp_offset = offset; + if (offset == 0) { + temp_mask &= (1 << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + unsigned long int temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} + + EXPORT unsigned /* long */ long int __chip_umul64hi(unsigned /* long */ long int x, unsigned /* long */ long int y) { diff --git a/include/hip/devicelib/integer/int_intrinsics.hh b/include/hip/devicelib/integer/int_intrinsics.hh index 85fc7ef4b..12b862a36 100644 --- a/include/hip/devicelib/integer/int_intrinsics.hh +++ b/include/hip/devicelib/integer/int_intrinsics.hh @@ -42,18 +42,30 @@ __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int offset, return __chip_bitinsert_u32(src0, src1, offset, width); } -extern "C" __device__ uint64_t __chip_bitinsert_u64(uint64_t src0, - uint64_t src1, - uint64_t offset, - uint64_t width); -extern "C++" inline __device__ uint64_t __bitinsert_u64(uint64_t src0, - uint64_t src1, - uint64_t offset, - uint64_t width) { +extern "C" __device__ unsigned long long int __chip_bitinsert_u64(unsigned long long int src0, + unsigned long long int src1, + unsigned long long int offset, + unsigned long long int width); +extern "C++" inline __device__ unsigned long long int __bitinsert_u64(unsigned long long int src0, + unsigned long long int src1, + unsigned long long int offset, + unsigned long long int width) { return __chip_bitinsert_u64(src0, src1, offset, width); } #endif // CHIP_ENABLE_NON_COMPLIANT_DEVICELIB_CODE +// int was replaced with int +// int64_t was replaced with long long int +extern "C" __device__ int __chip__fns32(unsigned long long int mask, unsigned int base, int offset); +extern "C++" inline __device__ int __fns32(unsigned long long int mask, unsigned int base, int offset) { + return __chip__fns32(mask, base, offset); +} + +extern "C" __device__ int __chip__fns64(unsigned long long int mask, unsigned int base, int offset); +extern "C++" inline __device__ int __fns64(unsigned long long int mask, unsigned int base, int offset) { + return __chip__fns64(mask, base, offset); +} + extern "C" __device__ unsigned int __chip_brev(unsigned int x); // Custom extern "C++" inline __device__ unsigned int __brev(unsigned int x) { return __chip_brev(x); diff --git a/include/hip/spirv_hip_devicelib.hh b/include/hip/spirv_hip_devicelib.hh index fffb4657e..865ceb5d1 100644 --- a/include/hip/spirv_hip_devicelib.hh +++ b/include/hip/spirv_hip_devicelib.hh @@ -158,79 +158,6 @@ EXPORT unsigned long long clock64() { // loss can be avoided. EXPORT clock_t clock() { return (clock_t)clock64(); } -// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), -// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. -// If not found, return -1. -__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) { - uint64_t temp_mask = mask; - int32_t temp_offset = offset; - - if (offset == 0) { - temp_mask &= (1 << base); - temp_offset = 1; - } - else if (offset < 0) { - temp_mask = __builtin_bitreverse64(mask); - base = 63 - base; - temp_offset = -offset; - } - - temp_mask = temp_mask & ((~0ULL) << base); - if (__builtin_popcountll(temp_mask) < temp_offset) - return -1; - int32_t total = 0; - for (int i = 0x20; i > 0; i >>= 1) { - uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); - int32_t pcnt = __builtin_popcountll(temp_mask_lo); - if (pcnt < temp_offset) { - temp_mask = temp_mask >> i; - temp_offset -= pcnt; - total += i; - } - else { - temp_mask = temp_mask_lo; - } - } - if (offset < 0) - return 63 - total; - else - return total; -} - -__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) { - uint64_t temp_mask = mask; - int32_t temp_offset = offset; - if (offset == 0) { - temp_mask &= (1 << base); - temp_offset = 1; - } - else if (offset < 0) { - temp_mask = __builtin_bitreverse64(mask); - base = 63 - base; - temp_offset = -offset; - } - temp_mask = temp_mask & ((~0ULL) << base); - if (__builtin_popcountll(temp_mask) < temp_offset) - return -1; - int32_t total = 0; - for (int i = 0x20; i > 0; i >>= 1) { - uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); - int32_t pcnt = __builtin_popcountll(temp_mask_lo); - if (pcnt < temp_offset) { - temp_mask = temp_mask >> i; - temp_offset -= pcnt; - total += i; - } - else { - temp_mask = temp_mask_lo; - } - } - if (offset < 0) - return 63 - total; - else - return total; -} - #include #endif From 2db8685f861da3aae0e68180c62059bce5d259dd Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 13 Mar 2024 04:34:20 +0200 Subject: [PATCH 16/33] update hip-tests --- hip-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hip-tests b/hip-tests index 4e952d145..f33b7d173 160000 --- a/hip-tests +++ b/hip-tests @@ -1 +1 @@ -Subproject commit 4e952d145c0a43f5a6c433db0cde6649eb73ea8a +Subproject commit f33b7d173f7af6a645d6a11a350e3850495c7686 From b265f08f08006d532a4162867ee86619721744ea Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 13 Mar 2024 10:19:55 +0200 Subject: [PATCH 17/33] Fix cmake HIP_VERSION --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4328c4d0c..94c049550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,7 +567,9 @@ set(_versionInfo "# Auto-generated by cmake\n") set(HIP_VERSION_MAJOR 5) set(HIP_VERSION_MINOR 1) set(HIP_VERSION_PATCH 0) -set(HIP_VERSION_PATCH_GITHASH 0) # TODO +set(HIP_VERSION_GITHASH 0) +set(HIP_VERSION_PATCH_GITHASH "${HIP_VERSION_PATCH}.${HIP_VERSION_GITHASH}") +set(HIP_VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PATCH}-${HIP_VERSION_PATCH_GITHASH}") add_to_config(_versionInfo HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}") add_to_config(_versionInfo HIP_VERSION_MINOR "${HIP_VERSION_MINOR}") From 97c4ff8768829ac19e935b8e098be220507be9b5 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 13 Mar 2024 10:20:21 +0200 Subject: [PATCH 18/33] Cmake - build tests --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 94c049550..0f076a3dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,8 @@ option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodul option(CHIP_ENABLE_NON_COMPLIANT_DEVICELIB_CODE "Enable non-compliant devicelib code such as calling LLVM builtins from inside kernel code. Enables certain unsigned long devicelib func variants" OFF) option(CHIP_FAST_MATH "Use native_ OpenCL functions which are fast but their precision is implementation defined" OFF) option(CHIP_USE_INTEL_USM "When enabled, cl_intel_unified_shared_memory extension, when available, will be used for HIP allocations in the OpenCL backend" ON) +option(CATCH2_DISCOVER_TESTS_COMPILE_TIME "Discover the tests at compile time" ON) + # This mitigation might be necessary on some systems with an older runtime. # This mitigation makes memory resident (disable swapping) on the GPU # This has a significant impact on the cost of a GPU malloc @@ -703,14 +705,10 @@ SET(HIP_PATH ${CMAKE_BINARY_DIR}) # pick up build sources, not install sources set(SAVED_WARN_DEPRECATED ${CMAKE_WARN_DEPRECATED}) set(CMAKE_WARN_DEPRECATED OFF) -if(CHIP_USE_EXTERNAL_HIP_TESTS) - add_subdirectory(hip-tests/catch catch) -else() - add_subdirectory(HIP/tests/catch catch) -endif() set(CMAKE_WARN_DEPRECATED ${SAVED_WARN_DEPRECATED}) if(CHIP_BUILD_TESTS) + add_subdirectory(hip-tests/catch catch) add_subdirectory(tests/cuda) add_subdirectory(tests/devicelib) add_subdirectory(tests/hiprtc) From 154b4c064e3fa8b2f5f1a842846ac4ce7b860867 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Thu, 14 Mar 2024 09:49:38 +0200 Subject: [PATCH 19/33] update hip-tests --- hip-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hip-tests b/hip-tests index f33b7d173..441d61748 160000 --- a/hip-tests +++ b/hip-tests @@ -1 +1 @@ -Subproject commit f33b7d173f7af6a645d6a11a350e3850495c7686 +Subproject commit 441d61748e532102c4602cb477d16ad1effd47f1 From cf2e07c3e8fd85ce0e7331b9aa07721671d6e62a Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Fri, 15 Mar 2024 03:27:51 +0200 Subject: [PATCH 20/33] tests cmake cleanup --- CMakeLists.txt | 8 +------- hip-tests | 2 +- tests/CMakeLists.txt | 22 ++++++++++++++++++++++ tests/compiler/CMakeLists.txt | 1 + tests/cuda/CMakeLists.txt | 1 + tests/devicelib/CMakeLists.txt | 15 +-------------- tests/fromLibCeed/CMakeLists.txt | 13 ------------- tests/hiprtc/CMakeLists.txt | 13 ------------- tests/runtime/CMakeLists.txt | 1 + 9 files changed, 28 insertions(+), 48 deletions(-) create mode 100644 tests/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f076a3dc..f6ca1c685 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -709,13 +709,7 @@ set(CMAKE_WARN_DEPRECATED ${SAVED_WARN_DEPRECATED}) if(CHIP_BUILD_TESTS) add_subdirectory(hip-tests/catch catch) - add_subdirectory(tests/cuda) - add_subdirectory(tests/devicelib) - add_subdirectory(tests/hiprtc) - add_subdirectory(tests/fromLibCeed) - add_subdirectory(tests/compiler) - add_subdirectory(tests/runtime) - add_subdirectory(tests/post-install) + add_subdirectory(tests) endif() if(CHIP_BUILD_SAMPLES) diff --git a/hip-tests b/hip-tests index 441d61748..52a8b06b0 160000 --- a/hip-tests +++ b/hip-tests @@ -1 +1 @@ -Subproject commit 441d61748e532102c4602cb477d16ad1effd47f1 +Subproject commit 52a8b06b035b95a602f6d18a8ab8f9ce4e92ef4d diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 000000000..65e6e08eb --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,22 @@ +function(add_hip_test MAIN_SOURCE) + get_filename_component(EXEC_NAME ${MAIN_SOURCE} NAME_WLE) + set_source_files_properties(${MAIN_SOURCE} PROPERTIES LANGUAGE CXX) + add_executable("${EXEC_NAME}" EXCLUDE_FROM_ALL ${MAIN_SOURCE}) + set_target_properties("${EXEC_NAME}" PROPERTIES CXX_STANDARD_REQUIRED ON) + + target_link_libraries("${EXEC_NAME}" CHIP deviceInternal) + target_include_directories("${EXEC_NAME}" + PUBLIC ${CMAKE_SOURCE_DIR}/HIP/include ${CMAKE_SOURCE_DIR}/include) + + add_test(NAME ${EXEC_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${EXEC_NAME}) + add_dependencies(build_tests "${EXEC_NAME}") +endfunction() + +add_subdirectory(compiler) +add_subdirectory(cuda) +add_subdirectory(devicelib) +add_subdirectory(fromLibCeed) +add_subdirectory(hiprtc) +add_subdirectory(post-install) +add_subdirectory(runtime) + diff --git a/tests/compiler/CMakeLists.txt b/tests/compiler/CMakeLists.txt index bb5c07fc8..1220a66f8 100644 --- a/tests/compiler/CMakeLists.txt +++ b/tests/compiler/CMakeLists.txt @@ -41,6 +41,7 @@ function(add_hipcc_test MAIN_SOURCE) COMMAND ${CMAKE_BINARY_DIR}/bin/hipcc ${TESTOPT_HIPCC_OPTIONS} ${CMAKE_CURRENT_SOURCE_DIR}/${MAIN_SOURCE} -o /dev/null) + endfunction() # add_shell_test() diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 353d1902d..a66bc747b 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -15,6 +15,7 @@ function(add_compile_test) add_test(NAME ${TEST_NAME} COMMAND bash -c "${CMAKE_BINARY_DIR}/bin/cuspvc ${TEST_COMPILE_OPTIONS} ${TEST_SOURCES} 2>&1" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") + endfunction() add_compile_test(NAME activemask SOURCES activemask.cu diff --git a/tests/devicelib/CMakeLists.txt b/tests/devicelib/CMakeLists.txt index 921314a6d..b59d2c065 100644 --- a/tests/devicelib/CMakeLists.txt +++ b/tests/devicelib/CMakeLists.txt @@ -23,19 +23,6 @@ # #============================================================================= -function(add_hip_test MAIN_SOURCE) - get_filename_component(EXEC_NAME ${MAIN_SOURCE} NAME_WLE) - set_source_files_properties(${MAIN_SOURCE} PROPERTIES LANGUAGE CXX) - add_executable("${EXEC_NAME}" ${MAIN_SOURCE}) - set_target_properties("${EXEC_NAME}" PROPERTIES CXX_STANDARD_REQUIRED ON) - - target_link_libraries("${EXEC_NAME}" CHIP deviceInternal) - target_include_directories("${EXEC_NAME}" - PUBLIC ${CMAKE_SOURCE_DIR}/HIP/include ${CMAKE_SOURCE_DIR}/include) - - add_test(NAME ${EXEC_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${EXEC_NAME}) -endfunction() - add_hip_test(implicitCasts.cpp) # Add this as a hipcc otherwise the `make all` will fail which is not convenient indicator for test failure @@ -43,4 +30,4 @@ add_test(NAME "deviceMallocCompile" COMMAND ${CMAKE_BINARY_DIR}/bin/hipcc ${CMAKE_CURRENT_SOURCE_DIR}/deviceMallocCompile.cpp) -add_hip_test(sincospifSpotTest.cc) +add_hip_test(sincospifSpotTest.cc) \ No newline at end of file diff --git a/tests/fromLibCeed/CMakeLists.txt b/tests/fromLibCeed/CMakeLists.txt index 1156d9191..1554c203c 100644 --- a/tests/fromLibCeed/CMakeLists.txt +++ b/tests/fromLibCeed/CMakeLists.txt @@ -23,18 +23,5 @@ # #============================================================================= -function(add_hip_test MAIN_SOURCE) - get_filename_component(EXEC_NAME ${MAIN_SOURCE} NAME_WLE) - set_source_files_properties(${MAIN_SOURCE} PROPERTIES LANGUAGE CXX) - add_executable("${EXEC_NAME}" ${MAIN_SOURCE}) - set_target_properties("${EXEC_NAME}" PROPERTIES CXX_STANDARD_REQUIRED ON) - - target_link_libraries("${EXEC_NAME}" CHIP deviceInternal) - target_include_directories("${EXEC_NAME}" - PUBLIC ${CMAKE_SOURCE_DIR}/HIP/include ${CMAKE_SOURCE_DIR}/include) - - add_test(NAME ${EXEC_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${EXEC_NAME}) -endfunction() - add_hip_test(firstTouch.cpp) add_hip_test(syncthreadsExitedThreads.cpp) \ No newline at end of file diff --git a/tests/hiprtc/CMakeLists.txt b/tests/hiprtc/CMakeLists.txt index 0ec93ae8c..e59a2570b 100644 --- a/tests/hiprtc/CMakeLists.txt +++ b/tests/hiprtc/CMakeLists.txt @@ -23,18 +23,5 @@ # #============================================================================= -function(add_hip_test MAIN_SOURCE) - get_filename_component(EXEC_NAME ${MAIN_SOURCE} NAME_WLE) - set_source_files_properties(${MAIN_SOURCE} PROPERTIES LANGUAGE CXX) - add_executable("${EXEC_NAME}" ${MAIN_SOURCE}) - set_target_properties("${EXEC_NAME}" PROPERTIES CXX_STANDARD_REQUIRED ON) - - target_link_libraries("${EXEC_NAME}" CHIP deviceInternal) - target_include_directories("${EXEC_NAME}" - PUBLIC ${CMAKE_SOURCE_DIR}/HIP/include ${CMAKE_SOURCE_DIR}/include) - - add_test(NAME ${EXEC_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${EXEC_NAME}) -endfunction() - add_hip_test(TestHiprtcCPPKernels.cc) add_hip_test(TestHiprtcOptions.cc) diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt index 90296089b..062093661 100644 --- a/tests/runtime/CMakeLists.txt +++ b/tests/runtime/CMakeLists.txt @@ -45,6 +45,7 @@ function(add_hip_runtime_test MAIN_SOURCE) set_tests_properties("${EXEC_NAME}" PROPERTIES SKIP_RETURN_CODE ${CHIP_SKIP_TEST}) + add_dependencies(build_tests "${EXEC_NAME}") endfunction() # add_shell_test() From c5e48085da6e045d81e4b0a61dc11372f247f783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 21 Mar 2024 09:39:30 +0200 Subject: [PATCH 21/33] HipLowerZeroLegthArrays: process PtrToint constexprs This covers printf() uses involving casts from integer literals to pointers. Should fix #798. --- llvm_passes/HipLowerZeroLengthArrays.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm_passes/HipLowerZeroLengthArrays.cpp b/llvm_passes/HipLowerZeroLengthArrays.cpp index 316fb5912..0ddade37a 100644 --- a/llvm_passes/HipLowerZeroLengthArrays.cpp +++ b/llvm_passes/HipLowerZeroLengthArrays.cpp @@ -78,12 +78,13 @@ static Constant *getLoweredConstantOrNull(Constant *C) { (NewPtr ? NewPtr : OrigPtr), NewIndices, GEP->isInBounds()); return NewGEP; - } else if (isa(CE) || isa(CE)) { + } else if (isa(CE) || isa(CE) || + CE->getOpcode() == Instruction::IntToPtr) { auto *LoweredOpd0 = getLoweredConstantOrNull(CE->getOperand(0)); auto *LoweredTy = getLoweredTypeOrNull(CE->getType()); if (LoweredOpd0 || LoweredTy) - return ConstantExpr::getPointerCast( - LoweredOpd0 ? LoweredOpd0 : CE->getOperand(0), + return ConstantExpr::getCast( + CE->getOpcode(), LoweredOpd0 ? LoweredOpd0 : CE->getOperand(0), LoweredTy ? LoweredTy : CE->getType()); return nullptr; From 4f8dcc62e192440e17f2df8192c1b636b48b5493 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 1 Apr 2024 03:07:16 +0300 Subject: [PATCH 22/33] copy in old tests --- CMakeLists.txt | 5 +++++ HIP | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6ca1c685..9e0ebcf1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -705,6 +705,11 @@ SET(HIP_PATH ${CMAKE_BINARY_DIR}) # pick up build sources, not install sources set(SAVED_WARN_DEPRECATED ${CMAKE_WARN_DEPRECATED}) set(CMAKE_WARN_DEPRECATED OFF) +if(CHIP_USE_EXTERNAL_HIP_TESTS) + add_subdirectory(hip-tests/catch catch) +else() + add_subdirectory(HIP/tests/catch catch) +endif() set(CMAKE_WARN_DEPRECATED ${SAVED_WARN_DEPRECATED}) if(CHIP_BUILD_TESTS) diff --git a/HIP b/HIP index 03f0970a0..5f621263a 160000 --- a/HIP +++ b/HIP @@ -1 +1 @@ -Subproject commit 03f0970a02f01e538f35ed9d6a9baaf37469568f +Subproject commit 5f621263aeaf16e28558b60dfa385fe5204d908e From 5b00b1b5c67e709017ebad2bdb0d0297a01630ec Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 1 Apr 2024 03:07:58 +0300 Subject: [PATCH 23/33] CHIP_USE_EXTERNAL_HIP_TESTS OFF --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e0ebcf1c..96d4ae4c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,7 +213,7 @@ option(CHIP_ENABLE_UNCOMPILABLE_TESTS "Enable tests which are known to not compi option(CHIP_BUILD_TESTS "Enable build_tests target" ON) option(CHIP_BUILD_SAMPLES "Build samples" ON) option(CHIP_DUBIOUS_LOCKS "Enable locks that don't seem necessary but make a lot of valgrind issues go away" OFF) -option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" ON) +option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" OFF) option(CHIP_ENABLE_NON_COMPLIANT_DEVICELIB_CODE "Enable non-compliant devicelib code such as calling LLVM builtins from inside kernel code. Enables certain unsigned long devicelib func variants" OFF) option(CHIP_FAST_MATH "Use native_ OpenCL functions which are fast but their precision is implementation defined" OFF) option(CHIP_USE_INTEL_USM "When enabled, cl_intel_unified_shared_memory extension, when available, will be used for HIP allocations in the OpenCL backend" ON) From 99d54d65ec72037ac27b65c88b804cb5ef7be726 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 1 Apr 2024 04:49:01 +0300 Subject: [PATCH 24/33] disable tests which are now incompatible with HIP 6.x --- CMakeLists.txt | 13 +++++-------- HIP | 2 +- tests/known_failures.yaml | 1 + 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96d4ae4c4..855634f28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -705,15 +705,12 @@ SET(HIP_PATH ${CMAKE_BINARY_DIR}) # pick up build sources, not install sources set(SAVED_WARN_DEPRECATED ${CMAKE_WARN_DEPRECATED}) set(CMAKE_WARN_DEPRECATED OFF) -if(CHIP_USE_EXTERNAL_HIP_TESTS) - add_subdirectory(hip-tests/catch catch) -else() - add_subdirectory(HIP/tests/catch catch) -endif() -set(CMAKE_WARN_DEPRECATED ${SAVED_WARN_DEPRECATED}) - if(CHIP_BUILD_TESTS) - add_subdirectory(hip-tests/catch catch) + if(CHIP_USE_EXTERNAL_HIP_TESTS) + add_subdirectory(hip-tests/catch catch) + else() + add_subdirectory(HIP/tests/catch catch) + endif() add_subdirectory(tests) endif() diff --git a/HIP b/HIP index 5f621263a..d9a658b0d 160000 --- a/HIP +++ b/HIP @@ -1 +1 @@ -Subproject commit 5f621263aeaf16e28558b60dfa385fe5204d908e +Subproject commit d9a658b0d0fbd0412faf186b6827716920c18b1b diff --git a/tests/known_failures.yaml b/tests/known_failures.yaml index 07ee2ddd8..064653f9b 100644 --- a/tests/known_failures.yaml +++ b/tests/known_failures.yaml @@ -1,5 +1,6 @@ TOTAL_TESTS: 1397 ALL: + Print_Out_Attributes: 'old HIP tests + new HIP API' TestAssert: '' TestAssertFail: '' TestIndirectCall: '' From 05da6d95e3dde927748ed32572ec7d6178d525b6 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 1 Apr 2024 04:57:31 +0300 Subject: [PATCH 25/33] remove inline keyword to suppress warnings --- .../hip/devicelib/type_casting_intrinsics.hh | 130 +++++++++--------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/include/hip/devicelib/type_casting_intrinsics.hh b/include/hip/devicelib/type_casting_intrinsics.hh index 338bf0182..4bf8faf55 100644 --- a/include/hip/devicelib/type_casting_intrinsics.hh +++ b/include/hip/devicelib/type_casting_intrinsics.hh @@ -25,50 +25,50 @@ #include -extern "C++" inline __device__ float __double2float_rd(double x); -extern "C++" inline __device__ float __double2float_rn(double x); -extern "C++" inline __device__ float __double2float_ru(double x); -extern "C++" inline __device__ float __double2float_rz(double x); -extern "C++" inline __device__ int __double2hiint(double x); -extern "C++" inline __device__ int __double2int_rd(double x); -extern "C++" inline __device__ int __double2int_rn(double x); -extern "C++" inline __device__ int __double2int_ru(double x); -extern "C++" inline __device__ int __double2int_rz(double x); -extern "C++" inline __device__ long long int __double2ll_rd(double x); -extern "C++" inline __device__ long long int __double2ll_rn(double x); -extern "C++" inline __device__ long long int __double2ll_ru(double x); -extern "C++" inline __device__ long long int __double2ll_rz(double x); -extern "C++" inline __device__ int __double2loint(double x); -extern "C++" inline __device__ unsigned int __double2uint_rd(double x); -extern "C++" inline __device__ unsigned int __double2uint_rn(double x); -extern "C++" inline __device__ unsigned int __double2uint_ru(double x); -extern "C++" inline __device__ unsigned int __double2uint_rz(double x); -extern "C++" inline __device__ unsigned long long int __double2ull_rd(double x); -extern "C++" inline __device__ unsigned long long int __double2ull_rn(double x); -extern "C++" inline __device__ unsigned long long int __double2ull_ru(double x); -extern "C++" inline __device__ unsigned long long int __double2ull_rz(double x); +extern "C++" __device__ float __double2float_rd(double x); +extern "C++" __device__ float __double2float_rn(double x); +extern "C++" __device__ float __double2float_ru(double x); +extern "C++" __device__ float __double2float_rz(double x); +extern "C++" __device__ int __double2hiint(double x); +extern "C++" __device__ int __double2int_rd(double x); +extern "C++" __device__ int __double2int_rn(double x); +extern "C++" __device__ int __double2int_ru(double x); +extern "C++" __device__ int __double2int_rz(double x); +extern "C++" __device__ long long int __double2ll_rd(double x); +extern "C++" __device__ long long int __double2ll_rn(double x); +extern "C++" __device__ long long int __double2ll_ru(double x); +extern "C++" __device__ long long int __double2ll_rz(double x); +extern "C++" __device__ int __double2loint(double x); +extern "C++" __device__ unsigned int __double2uint_rd(double x); +extern "C++" __device__ unsigned int __double2uint_rn(double x); +extern "C++" __device__ unsigned int __double2uint_ru(double x); +extern "C++" __device__ unsigned int __double2uint_rz(double x); +extern "C++" __device__ unsigned long long int __double2ull_rd(double x); +extern "C++" __device__ unsigned long long int __double2ull_rn(double x); +extern "C++" __device__ unsigned long long int __double2ull_ru(double x); +extern "C++" __device__ unsigned long long int __double2ull_rz(double x); extern "C" __device__ long long int __chip_double_as_longlong(double x); extern "C++" inline __device__ long long int __double_as_longlong(double x) { return __chip_double_as_longlong(x); } -extern "C++" inline __device__ int __float2int_rd(float x); -extern "C++" inline __device__ int __float2int_rn(float x); -extern "C++" inline __device__ int __float2int_ru(float); -extern "C++" inline __device__ int __float2int_rz(float x); -extern "C++" inline __device__ long long int __float2ll_rd(float x); -extern "C++" inline __device__ long long int __float2ll_rn(float x); -extern "C++" inline __device__ long long int __float2ll_ru(float x); -extern "C++" inline __device__ long long int __float2ll_rz(float x); -extern "C++" inline __device__ unsigned int __float2uint_rd(float x); -extern "C++" inline __device__ unsigned int __float2uint_rn(float x); -extern "C++" inline __device__ unsigned int __float2uint_ru(float x); -extern "C++" inline __device__ unsigned int __float2uint_rz(float x); -extern "C++" inline __device__ unsigned long long int __float2ull_rd(float x); -extern "C++" inline __device__ unsigned long long int __float2ull_rn(float x); -extern "C++" inline __device__ unsigned long long int __float2ull_ru(float x); -extern "C++" inline __device__ unsigned long long int __float2ull_rz(float x); +extern "C++" __device__ int __float2int_rd(float x); +extern "C++" __device__ int __float2int_rn(float x); +extern "C++" __device__ int __float2int_ru(float); +extern "C++" __device__ int __float2int_rz(float x); +extern "C++" __device__ long long int __float2ll_rd(float x); +extern "C++" __device__ long long int __float2ll_rn(float x); +extern "C++" __device__ long long int __float2ll_ru(float x); +extern "C++" __device__ long long int __float2ll_rz(float x); +extern "C++" __device__ unsigned int __float2uint_rd(float x); +extern "C++" __device__ unsigned int __float2uint_rn(float x); +extern "C++" __device__ unsigned int __float2uint_ru(float x); +extern "C++" __device__ unsigned int __float2uint_rz(float x); +extern "C++" __device__ unsigned long long int __float2ull_rd(float x); +extern "C++" __device__ unsigned long long int __float2ull_rn(float x); +extern "C++" __device__ unsigned long long int __float2ull_ru(float x); +extern "C++" __device__ unsigned long long int __float2ull_rz(float x); extern "C" __device__ int __chip_float_as_int(float x); extern "C++" inline __device__ int __float_as_int(float x) { @@ -80,50 +80,50 @@ extern "C++" inline __device__ unsigned int __float_as_uint(float x) { return __chip_float_as_uint(x); } -extern "C++" inline __device__ double __hiloint2double(int hi, int lo); -extern "C++" inline __device__ double __int2double_rn(int x); -extern "C++" inline __device__ float __int2float_rd(int x); -extern "C++" inline __device__ float __int2float_rn(int x); -extern "C++" inline __device__ float __int2float_ru(int x); -extern "C++" inline __device__ float __int2float_rz(int x); +extern "C++" __device__ double __hiloint2double(int hi, int lo); +extern "C++" __device__ double __int2double_rn(int x); +extern "C++" __device__ float __int2float_rd(int x); +extern "C++" __device__ float __int2float_rn(int x); +extern "C++" __device__ float __int2float_ru(int x); +extern "C++" __device__ float __int2float_rz(int x); extern "C" __device__ float __chip_int_as_float(int x); extern "C++" inline __device__ float __int_as_float(int x) { return __chip_int_as_float(x); } -extern "C++" inline __device__ double __ll2double_rd(long long int x); -extern "C++" inline __device__ double __ll2double_rn(long long int x); -extern "C++" inline __device__ double __ll2double_ru(long long int x); -extern "C++" inline __device__ double __ll2double_rz(long long int x); -extern "C++" inline __device__ float __ll2float_rd(long long int x); -extern "C++" inline __device__ float __ll2float_rn(long long int x); -extern "C++" inline __device__ float __ll2float_ru(long long int x); -extern "C++" inline __device__ float __ll2float_rz(long long int x); +extern "C++" __device__ double __ll2double_rd(long long int x); +extern "C++" __device__ double __ll2double_rn(long long int x); +extern "C++" __device__ double __ll2double_ru(long long int x); +extern "C++" __device__ double __ll2double_rz(long long int x); +extern "C++" __device__ float __ll2float_rd(long long int x); +extern "C++" __device__ float __ll2float_rn(long long int x); +extern "C++" __device__ float __ll2float_ru(long long int x); +extern "C++" __device__ float __ll2float_rz(long long int x); extern "C" __device__ double __chip_longlong_as_double(long long int x); extern "C++" inline __device__ double __longlong_as_double(long long int x) { return __chip_longlong_as_double(x); } -extern "C++" inline __device__ double __uint2double_rn(unsigned int x); -extern "C++" inline __device__ float __uint2float_rd(unsigned int x); -extern "C++" inline __device__ float __uint2float_rn(unsigned int x); -extern "C++" inline __device__ float __uint2float_ru(unsigned int x); -extern "C++" inline __device__ float __uint2float_rz(unsigned int x); +extern "C++" __device__ double __uint2double_rn(unsigned int x); +extern "C++" __device__ float __uint2float_rd(unsigned int x); +extern "C++" __device__ float __uint2float_rn(unsigned int x); +extern "C++" __device__ float __uint2float_ru(unsigned int x); +extern "C++" __device__ float __uint2float_rz(unsigned int x); extern "C" __device__ float __chip_uint_as_float(uint x); extern "C++" inline __device__ float __uint_as_float(unsigned int x) { return __chip_uint_as_float(x); } -extern "C++" inline __device__ double __ull2double_rd(unsigned long long int x); -extern "C++" inline __device__ double __ull2double_rn(unsigned long long int x); -extern "C++" inline __device__ double __ull2double_ru(unsigned long long int x); -extern "C++" inline __device__ double __ull2double_rz(unsigned long long int x); -extern "C++" inline __device__ float __ull2float_rd(unsigned long long int x); -extern "C++" inline __device__ float __ull2float_rn(unsigned long long int x); -extern "C++" inline __device__ float __ull2float_ru(unsigned long long int x); -extern "C++" inline __device__ float __ull2float_rz(unsigned long long int x); +extern "C++" __device__ double __ull2double_rd(unsigned long long int x); +extern "C++" __device__ double __ull2double_rn(unsigned long long int x); +extern "C++" __device__ double __ull2double_ru(unsigned long long int x); +extern "C++" __device__ double __ull2double_rz(unsigned long long int x); +extern "C++" __device__ float __ull2float_rd(unsigned long long int x); +extern "C++" __device__ float __ull2float_rn(unsigned long long int x); +extern "C++" __device__ float __ull2float_ru(unsigned long long int x); +extern "C++" __device__ float __ull2float_rz(unsigned long long int x); #endif // include guard From d1d31665278e2f6ceadc8c0d3c416a7c63172203 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Tue, 2 Apr 2024 05:29:48 +0300 Subject: [PATCH 26/33] linter adjust --- .github/workflows/clang-tidy-format.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-tidy-format.yml b/.github/workflows/clang-tidy-format.yml index 470e84b06..f62c377d1 100644 --- a/.github/workflows/clang-tidy-format.yml +++ b/.github/workflows/clang-tidy-format.yml @@ -28,7 +28,7 @@ jobs: files-changed-only: true lines-changed-only: true extensions: 'cc,hh' - tidy-checks: 'readability-identifier-naming' + tidy-checks: '-*,readability-*,modernize-*,clang-analyzer-*' - name: Fail fast?! if: steps.linter.outputs.checks-failed > 0 run: exit 1 From 3977e979f38ba6b0d32784de523000530a80b0d6 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 00:24:37 +0300 Subject: [PATCH 27/33] PR comments --- expectedArgs.cpp | 48 ------------------------------------ src/CHIPBindings.cc | 2 +- tests/runtime/CMakeLists.txt | 1 - 3 files changed, 1 insertion(+), 50 deletions(-) delete mode 100644 expectedArgs.cpp diff --git a/expectedArgs.cpp b/expectedArgs.cpp deleted file mode 100644 index a9eea491e..000000000 --- a/expectedArgs.cpp +++ /dev/null @@ -1,48 +0,0 @@ - -#include - -template -std::tuple getExpectedArgs(void(*)(FArgs...)) {}; - -template -void validateArguments(F f, Args&&... args) { - using expectedArgsTuple = decltype(getExpectedArgs(f)); - using providedArgsTuple = std::tuple; - - static_assert(std::is_same::value, - "Kernel arguments types must match exactly!"); -} - -// General launchKernel function -template -void launchKernel(Kernel kernel, Dim numBlocks, Dim numThreads, std::uint32_t memPerBlock, hipStream_t stream, Args&&... args) { - // Define a stateless, capture-free lambda that matches the kernel's signature. - auto kernelWrapperLambda = [] (Args... args) { - // This lambda is intentionally left empty as it's used solely for type validation. - }; - - // Convert the lambda to a function pointer. - void (*kernelWrapper)(Args...) = kernelWrapperLambda; - - // Use the wrapper function pointer to validate arguments. - validateArguments(kernelWrapper, std::forward(args)...); - - // Launch the kernel directly with the provided arguments. - kernel<<>>(std::forward(args)...); - -} - -template void __global__ vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {} - -int main() { - int LEN = 1; - dim3 dimGrid(LEN / 512, 1, 1); - dim3 dimBlock(512, 1, 1); - float *A_d, *B_d, *C_d; - - - - launchKernel(vectorADD, dimGrid, dimBlock, - 0, 0, static_cast(A_d), - static_cast(B_d), C_d, static_cast(LEN)); -} \ No newline at end of file diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index 5034af850..2f7f3caaa 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -1494,7 +1494,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, attributes->devicePointer = const_cast(ptr); attributes->hostPointer = AllocInfo->HostPtr; attributes->isManaged = AllocInfo->Managed; - // attributes->memoryType = AllocInfo->MemoryType; + attributes->type = AllocInfo->MemoryType; // Seems strange but the expected behavior is that if // hipPointerGetAttributes gets called with an offset host pointer, the diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt index 062093661..90296089b 100644 --- a/tests/runtime/CMakeLists.txt +++ b/tests/runtime/CMakeLists.txt @@ -45,7 +45,6 @@ function(add_hip_runtime_test MAIN_SOURCE) set_tests_properties("${EXEC_NAME}" PROPERTIES SKIP_RETURN_CODE ${CHIP_SKIP_TEST}) - add_dependencies(build_tests "${EXEC_NAME}") endfunction() # add_shell_test() From 366d99ec68f37284ed70fecefa4605fd154339c1 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 00:28:35 +0300 Subject: [PATCH 28/33] document that coop groups are unimplemented --- docs/Features.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Features.md b/docs/Features.md index 8bd29c0eb..d8cc740f5 100644 --- a/docs/Features.md +++ b/docs/Features.md @@ -41,6 +41,8 @@ CUDA features not present in HIP are unsupported unless explicitly stated otherw * few memory APIs (hipMemPrefetchAsync, hipMemAdvise) +* Cooperative Groups API + #### partially supported * Texture Objects of 1D/2D type are supported; 3D, LOD, Grad, From 79180bc9bb849be453612d02e657f36cd7eacde6 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 00:40:25 +0300 Subject: [PATCH 29/33] clang-format main --- .../hip/devicelib/integer/int_intrinsics.hh | 27 +- include/hip/spirv_hip_cooperative_groups.h | 542 ++++++++++-------- .../hip/spirv_hip_cooperative_groups_helper.h | 174 +++--- include/hip/spirv_hip_runtime.h | 22 +- samples/hipInfo/hipInfo.cpp | 4 +- src/CHIPBindings.cc | 13 +- src/common.hh | 2 - 7 files changed, 431 insertions(+), 353 deletions(-) diff --git a/include/hip/devicelib/integer/int_intrinsics.hh b/include/hip/devicelib/integer/int_intrinsics.hh index 12b862a36..49080c7b9 100644 --- a/include/hip/devicelib/integer/int_intrinsics.hh +++ b/include/hip/devicelib/integer/int_intrinsics.hh @@ -42,27 +42,30 @@ __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int offset, return __chip_bitinsert_u32(src0, src1, offset, width); } -extern "C" __device__ unsigned long long int __chip_bitinsert_u64(unsigned long long int src0, - unsigned long long int src1, - unsigned long long int offset, - unsigned long long int width); -extern "C++" inline __device__ unsigned long long int __bitinsert_u64(unsigned long long int src0, - unsigned long long int src1, - unsigned long long int offset, - unsigned long long int width) { +extern "C" __device__ unsigned long long int +__chip_bitinsert_u64(unsigned long long int src0, unsigned long long int src1, + unsigned long long int offset, + unsigned long long int width); +extern "C++" inline __device__ unsigned long long int +__bitinsert_u64(unsigned long long int src0, unsigned long long int src1, + unsigned long long int offset, unsigned long long int width) { return __chip_bitinsert_u64(src0, src1, offset, width); } #endif // CHIP_ENABLE_NON_COMPLIANT_DEVICELIB_CODE // int was replaced with int // int64_t was replaced with long long int -extern "C" __device__ int __chip__fns32(unsigned long long int mask, unsigned int base, int offset); -extern "C++" inline __device__ int __fns32(unsigned long long int mask, unsigned int base, int offset) { +extern "C" __device__ int __chip__fns32(unsigned long long int mask, + unsigned int base, int offset); +extern "C++" inline __device__ int __fns32(unsigned long long int mask, + unsigned int base, int offset) { return __chip__fns32(mask, base, offset); } -extern "C" __device__ int __chip__fns64(unsigned long long int mask, unsigned int base, int offset); -extern "C++" inline __device__ int __fns64(unsigned long long int mask, unsigned int base, int offset) { +extern "C" __device__ int __chip__fns64(unsigned long long int mask, + unsigned int base, int offset); +extern "C++" inline __device__ int __fns64(unsigned long long int mask, + unsigned int base, int offset) { return __chip__fns64(mask, base, offset); } diff --git a/include/hip/spirv_hip_cooperative_groups.h b/include/hip/spirv_hip_cooperative_groups.h index 28bef4fe6..f4b800a3d 100644 --- a/include/hip/spirv_hip_cooperative_groups.h +++ b/include/hip/spirv_hip_cooperative_groups.h @@ -46,22 +46,23 @@ namespace cooperative_groups { * \details Holds the key properties of a constructed cooperative group types * object, like the group type, its size, etc * - * @note Cooperative groups feature is implemented on Linux, under developement - * on Windows. + * @note Cooperative groups feature is implemented on Linux, under + * developement on Windows. */ class thread_group { - protected: - uint32_t _type; // thread_group type - uint32_t _size; // total number of threads in the tread_group - uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types, - // LSB represents lane 0, and MSB represents lane 63 +protected: + uint32_t _type; // thread_group type + uint32_t _size; // total number of threads in the tread_group + uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types, + // LSB represents lane 0, and MSB represents lane 63 // Construct a thread group, and set thread group type and other essential // thread group properties. This generic thread group is directly constructed // only when the group is supposed to contain only the calling the thread // (throurh the API - `this_thread()`), and in all other cases, this thread // group object is a sub-object of some other derived thread group object - __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast(0), + __CG_QUALIFIER__ thread_group(internal::group_type type, + uint32_t size = static_cast(0), uint64_t mask = static_cast(0)) { _type = type; _size = size; @@ -81,11 +82,11 @@ class thread_group { struct _tiled_info tiled_info; } coalesced_info; - friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, - unsigned int tile_size); + friend __CG_QUALIFIER__ thread_group + tiled_partition(const thread_group &parent, unsigned int tile_size); friend class thread_block; - public: +public: // Total number of threads in the thread group, and this serves the purpose // for all derived cooperative group types since their `size` is directly // saved during the construction @@ -105,12 +106,13 @@ class thread_group { * @ingroup API * @{ * This section describes the cooperative groups functions of HIP runtime API. - * - * The cooperative groups provides flexible thread parallel programming algorithms, threads - * cooperate and share data to perform collective computations. * - * @note Cooperative groups feature is implemented on Linux, under developement - * on Windows. + * The cooperative groups provides flexible thread parallel programming + *algorithms, threads cooperate and share data to perform collective + *computations. + * + * @note Cooperative groups feature is implemented on Linux, under + *developement on Windows. * */ /** \brief The multi-grid cooperative group type @@ -118,28 +120,36 @@ class thread_group { * \details Represents an inter-device cooperative group type where the * participating threads within the group spans across multple * devices, running the (same) kernel on these devices - * @note The multi-grid cooperative group type is implemented on Linux, under developement - * on Windows. + * @note The multi-grid cooperative group type is implemented on Linux, under + * developement on Windows. */ class multi_grid_group : public thread_group { - // Only these friend functions are allowed to construct an object of this class - // and access its resources + // Only these friend functions are allowed to construct an object of this + // class and access its resources friend __CG_QUALIFIER__ multi_grid_group this_multi_grid(); - protected: +protected: // Construct mutli-grid thread group (through the API this_multi_grid()) explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size) : thread_group(internal::cg_multi_grid, size) {} - public: +public: // Number of invocations participating in this multi-grid group. In other // words, the number of GPUs - __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); } + __CG_QUALIFIER__ uint32_t num_grids() { + return internal::multi_grid::num_grids(); + } // Rank of this invocation. In other words, an ID number within the range // [0, num_grids()) of the GPU, this kernel is running on - __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); } - __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); } - __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); } + __CG_QUALIFIER__ uint32_t grid_rank() { + return internal::multi_grid::grid_rank(); + } + __CG_QUALIFIER__ uint32_t thread_rank() const { + return internal::multi_grid::thread_rank(); + } + __CG_QUALIFIER__ bool is_valid() const { + return internal::multi_grid::is_valid(); + } __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); } }; @@ -149,8 +159,8 @@ class multi_grid_group : public thread_group { * \details User is not allowed to directly construct an object of type * `multi_grid_group`. Instead, he should construct it through this * API function - * @note This multi-grid cooperative API type is implemented on Linux, under developement - * on Windows. + * @note This multi-grid cooperative API type is implemented on Linux, under + * developement on Windows. */ __CG_QUALIFIER__ multi_grid_group this_multi_grid() { return multi_grid_group(internal::multi_grid::size()); @@ -165,16 +175,19 @@ __CG_QUALIFIER__ multi_grid_group this_multi_grid() { * on Windows. */ class grid_group : public thread_group { - // Only these friend functions are allowed to construct an object of this class - // and access its resources + // Only these friend functions are allowed to construct an object of this + // class and access its resources friend __CG_QUALIFIER__ grid_group this_grid(); - protected: +protected: // Construct grid thread group (through the API this_grid()) - explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {} + explicit __CG_QUALIFIER__ grid_group(uint32_t size) + : thread_group(internal::cg_grid, size) {} - public: - __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); } +public: + __CG_QUALIFIER__ uint32_t thread_rank() const { + return internal::grid::thread_rank(); + } __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); } __CG_QUALIFIER__ void sync() const { internal::grid::sync(); } }; @@ -188,7 +201,9 @@ class grid_group : public thread_group { * @note This function is implemented on Linux, under developement * on Windows. */ -__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); } +__CG_QUALIFIER__ grid_group this_grid() { + return grid_group(internal::grid::size()); +} /** @brief The workgroup (thread-block in CUDA terminology) cooperative group * type @@ -203,11 +218,12 @@ class thread_block : public thread_group { // Only these friend functions are allowed to construct an object of thi // class and access its resources friend __CG_QUALIFIER__ thread_block this_thread_block(); - friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, - unsigned int tile_size); - friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, - unsigned int tile_size); - protected: + friend __CG_QUALIFIER__ thread_group + tiled_partition(const thread_group &parent, unsigned int tile_size); + friend __CG_QUALIFIER__ thread_group + tiled_partition(const thread_block &parent, unsigned int tile_size); + +protected: // Construct a workgroup thread group (through the API this_thread_block()) explicit __CG_QUALIFIER__ thread_block(uint32_t size) : thread_group(internal::cg_workgroup, size) {} @@ -222,19 +238,31 @@ class thread_block : public thread_group { thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size); tiledGroup.coalesced_info.tiled_info.size = tile_size; tiledGroup.coalesced_info.tiled_info.is_tiled = true; - tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; - tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size; + tiledGroup.coalesced_info.tiled_info.meta_group_rank = + thread_rank() / tile_size; + tiledGroup.coalesced_info.tiled_info.meta_group_size = + (size() + tile_size - 1) / tile_size; return tiledGroup; } - public: +public: // 3-dimensional block index within the grid - __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); } + __CG_STATIC_QUALIFIER__ dim3 group_index() { + return internal::workgroup::group_index(); + } // 3-dimensional thread index within the block - __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); } - __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); } - __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); } - __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); } + __CG_STATIC_QUALIFIER__ dim3 thread_index() { + return internal::workgroup::thread_index(); + } + __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + return internal::workgroup::thread_rank(); + } + __CG_STATIC_QUALIFIER__ uint32_t size() { + return internal::workgroup::size(); + } + __CG_STATIC_QUALIFIER__ bool is_valid() { + return internal::workgroup::is_valid(); + } __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); } __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); } }; @@ -261,10 +289,10 @@ __CG_QUALIFIER__ thread_block this_thread_block() { */ class tiled_group : public thread_group { - private: - friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, - unsigned int tile_size); - friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, +private: + friend __CG_QUALIFIER__ thread_group + tiled_partition(const thread_group &parent, unsigned int tile_size); + friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group &parent, unsigned int tile_size); __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const { @@ -283,23 +311,24 @@ class tiled_group : public thread_group { return tiledGroup; } - protected: +protected: explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize) : thread_group(internal::cg_tiled_group, tileSize) { coalesced_info.tiled_info.size = tileSize; coalesced_info.tiled_info.is_tiled = true; } - public: - __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); } +public: + __CG_QUALIFIER__ unsigned int size() const { + return (coalesced_info.tiled_info.size); + } __CG_QUALIFIER__ unsigned int thread_rank() const { - return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1)); + return (internal::workgroup::thread_rank() & + (coalesced_info.tiled_info.size - 1)); } - __CG_QUALIFIER__ void sync() const { - internal::tiled_group::sync(); - } + __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); } }; /** \brief The coalesced_group cooperative group type @@ -310,30 +339,38 @@ class tiled_group : public thread_group { * on Windows. */ class coalesced_group : public thread_group { - private: +private: friend __CG_QUALIFIER__ coalesced_group coalesced_threads(); - friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size); - friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size); + friend __CG_QUALIFIER__ thread_group + tiled_partition(const thread_group &parent, unsigned int tile_size); + friend __CG_QUALIFIER__ coalesced_group + tiled_partition(const coalesced_group &parent, unsigned int tile_size); - __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const { + __CG_QUALIFIER__ coalesced_group + new_tiled_group(unsigned int tile_size) const { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); if (!tile_size || (tile_size > size()) || !pow2) { return coalesced_group(0); } - // If a tiled group is passed to be partitioned further into a coalesced_group. - // prepare a mask for further partitioning it so that it stays coalesced. + // If a tiled group is passed to be partitioned further into a + // coalesced_group. prepare a mask for further partitioning it so that it + // stays coalesced. if (coalesced_info.tiled_info.is_tiled) { unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); - unsigned int masklength = min(static_cast(size()) - base_offset, tile_size); - lane_mask member_mask = static_cast(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); + unsigned int masklength = + min(static_cast(size()) - base_offset, tile_size); + lane_mask member_mask = + static_cast(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); member_mask <<= (__lane_id() & ~(tile_size - 1)); coalesced_group coalesced_tile = coalesced_group(member_mask); coalesced_tile.coalesced_info.tiled_info.is_tiled = true; - coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; - coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_rank = + thread_rank() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_size = + size() / tile_size; return coalesced_tile; } // Here the parent coalesced_group is not partitioned. @@ -347,7 +384,7 @@ class coalesced_group : public thread_group { // Make sure the lane is active if (active) { if (lanes_to_skip <= 0 && tile_rank < tile_size) { - // Prepare a member_mask that is appropriate for a tile + // Prepare a member_mask that is appropriate for a tile member_mask |= active; tile_rank++; } @@ -355,55 +392,54 @@ class coalesced_group : public thread_group { } } coalesced_group coalesced_tile = coalesced_group(member_mask); - coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_rank = + thread_rank() / tile_size; coalesced_tile.coalesced_info.tiled_info.meta_group_size = - (size() + tile_size - 1) / tile_size; + (size() + tile_size - 1) / tile_size; return coalesced_tile; } - return coalesced_group(0); + return coalesced_group(0); } - protected: - // Constructor +protected: + // Constructor explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask) : thread_group(internal::cg_coalesced_group) { coalesced_info.member_mask = member_mask; // Which threads are active - coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active + coalesced_info.size = + __popcll(coalesced_info.member_mask); // How many threads are active coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group coalesced_info.tiled_info.meta_group_rank = 0; coalesced_info.tiled_info.meta_group_size = 1; } - public: - __CG_QUALIFIER__ unsigned int size() const { - return coalesced_info.size; - } +public: + __CG_QUALIFIER__ unsigned int size() const { return coalesced_info.size; } - __CG_QUALIFIER__ unsigned int thread_rank() const { - return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask); - } + __CG_QUALIFIER__ unsigned int thread_rank() const { + return internal::coalesced_group::masked_bit_count( + coalesced_info.member_mask); + } - __CG_QUALIFIER__ void sync() const { - internal::coalesced_group::sync(); - } + __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); } - __CG_QUALIFIER__ unsigned int meta_group_rank() const { - return coalesced_info.tiled_info.meta_group_rank; - } + __CG_QUALIFIER__ unsigned int meta_group_rank() const { + return coalesced_info.tiled_info.meta_group_rank; + } - __CG_QUALIFIER__ unsigned int meta_group_size() const { - return coalesced_info.tiled_info.meta_group_size; - } + __CG_QUALIFIER__ unsigned int meta_group_size() const { + return coalesced_info.tiled_info.meta_group_size; + } - template - __CG_QUALIFIER__ T shfl(T var, int srcRank) const { + template __CG_QUALIFIER__ T shfl(T var, int srcRank) const { static_assert(is_valid_type::value, "Neither an integer or float type."); srcRank = srcRank % static_cast(size()); int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank - : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) - : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); + : (__AMDGCN_WAVEFRONT_SIZE == 64) + ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) + : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); } @@ -423,8 +459,7 @@ class coalesced_group : public thread_group { int lane; if (__AMDGCN_WAVEFRONT_SIZE == 64) { lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1); - } - else { + } else { lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1); } @@ -449,10 +484,11 @@ class coalesced_group : public thread_group { int lane; if (__AMDGCN_WAVEFRONT_SIZE == 64) { - lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); - } - else if (__AMDGCN_WAVEFRONT_SIZE == 32) { - lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); + lane = + __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); + } else if (__AMDGCN_WAVEFRONT_SIZE == 32) { + lane = + __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); } if (lane == -1) { @@ -465,13 +501,14 @@ class coalesced_group : public thread_group { /** \brief User exposed API to create coalesced groups. * - * \details A collective operation that groups all active lanes into a new thread group. + * \details A collective operation that groups all active lanes into a new + * thread group. * @note This function is implemented on Linux, under developement * on Windows. */ __CG_QUALIFIER__ coalesced_group coalesced_threads() { - return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec()); + return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec()); } /** @@ -481,25 +518,25 @@ __CG_QUALIFIER__ coalesced_group coalesced_threads() { */ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { switch (this->_type) { - case internal::cg_multi_grid: { - return (static_cast(this)->thread_rank()); - } - case internal::cg_grid: { - return (static_cast(this)->thread_rank()); - } - case internal::cg_workgroup: { - return (static_cast(this)->thread_rank()); - } - case internal::cg_tiled_group: { - return (static_cast(this)->thread_rank()); - } - case internal::cg_coalesced_group: { - return (static_cast(this)->thread_rank()); - } - default: { - __hip_assert(false && "invalid cooperative group type"); - return -1; - } + case internal::cg_multi_grid: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_grid: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_workgroup: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_tiled_group: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_coalesced_group: { + return (static_cast(this)->thread_rank()); + } + default: { + __hip_assert(false && "invalid cooperative group type"); + return -1; + } } } /** @@ -509,25 +546,25 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { */ __CG_QUALIFIER__ bool thread_group::is_valid() const { switch (this->_type) { - case internal::cg_multi_grid: { - return (static_cast(this)->is_valid()); - } - case internal::cg_grid: { - return (static_cast(this)->is_valid()); - } - case internal::cg_workgroup: { - return (static_cast(this)->is_valid()); - } - case internal::cg_tiled_group: { - return (static_cast(this)->is_valid()); - } - case internal::cg_coalesced_group: { - return (static_cast(this)->is_valid()); - } - default: { - __hip_assert(false && "invalid cooperative group type"); - return false; - } + case internal::cg_multi_grid: { + return (static_cast(this)->is_valid()); + } + case internal::cg_grid: { + return (static_cast(this)->is_valid()); + } + case internal::cg_workgroup: { + return (static_cast(this)->is_valid()); + } + case internal::cg_tiled_group: { + return (static_cast(this)->is_valid()); + } + case internal::cg_coalesced_group: { + return (static_cast(this)->is_valid()); + } + default: { + __hip_assert(false && "invalid cooperative group type"); + return false; + } } } /** @@ -537,29 +574,29 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const { */ __CG_QUALIFIER__ void thread_group::sync() const { switch (this->_type) { - case internal::cg_multi_grid: { - static_cast(this)->sync(); - break; - } - case internal::cg_grid: { - static_cast(this)->sync(); - break; - } - case internal::cg_workgroup: { - static_cast(this)->sync(); - break; - } - case internal::cg_tiled_group: { - static_cast(this)->sync(); - break; - } - case internal::cg_coalesced_group: { - static_cast(this)->sync(); - break; - } - default: { - __hip_assert(false && "invalid cooperative group type"); - } + case internal::cg_multi_grid: { + static_cast(this)->sync(); + break; + } + case internal::cg_grid: { + static_cast(this)->sync(); + break; + } + case internal::cg_workgroup: { + static_cast(this)->sync(); + break; + } + case internal::cg_tiled_group: { + static_cast(this)->sync(); + break; + } + case internal::cg_coalesced_group: { + static_cast(this)->sync(); + break; + } + default: { + __hip_assert(false && "invalid cooperative group type"); + } } } @@ -569,14 +606,16 @@ __CG_QUALIFIER__ void thread_group::sync() const { * @note This function is implemented on Linux, under developement * on Windows. */ -template __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); } +template __CG_QUALIFIER__ uint32_t group_size(CGTy const &g) { + return g.size(); +} /** * Implemenation of publicly exposed `wrapper` API on top of basic cooperative * group type APIs * @note This function is implemented on Linux, under developement * on Windows. */ -template __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) { +template __CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) { return g.thread_rank(); } /** @@ -585,24 +624,26 @@ template __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) { * @note This function is implemented on Linux, under developement * on Windows. */ -template __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); } +template __CG_QUALIFIER__ bool is_valid(CGTy const &g) { + return g.is_valid(); +} /** * Implemenation of publicly exposed `wrapper` API on top of basic cooperative * group type APIs * @note This function is implemented on Linux, under developement * on Windows. */ -template __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); } +template __CG_QUALIFIER__ void sync(CGTy const &g) { g.sync(); } /** * template class tile_base * @note This class is implemented on Linux, under developement * on Windows. */ template class tile_base { - protected: +protected: _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; - public: +public: // Rank of the thread within this tile _CG_STATIC_CONST_DECL_ unsigned int thread_rank() { return (internal::workgroup::thread_rank() & (numThreads - 1)); @@ -616,40 +657,43 @@ template class tile_base { * @note This class is implemented on Linux, under developement * on Windows. */ -template class thread_block_tile_base : public tile_base { +template +class thread_block_tile_base : public tile_base { static_assert(is_valid_tile_size::value, - "Tile size is either not a power of 2 or greater than the wavefront size"); + "Tile size is either not a power of 2 or greater than the " + "wavefront size"); using tile_base::numThreads; - public: - __CG_STATIC_QUALIFIER__ void sync() { - internal::tiled_group::sync(); - } +public: + __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); } template __CG_QUALIFIER__ T shfl(T var, int srcRank) const { static_assert(is_valid_type::value, "Neither an integer or float type."); return (__shfl(var, srcRank, numThreads)); } - template __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const { + template + __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const { static_assert(is_valid_type::value, "Neither an integer or float type."); return (__shfl_down(var, lane_delta, numThreads)); } - template __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const { + template + __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const { static_assert(is_valid_type::value, "Neither an integer or float type."); return (__shfl_up(var, lane_delta, numThreads)); } - template __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const { + template + __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const { static_assert(is_valid_type::value, "Neither an integer or float type."); return (__shfl_xor(var, laneMask, numThreads)); } }; -/** \brief User exposed API that captures the state of the parent group pre-partition +/** \brief User exposed API that captures the state of the parent group + * pre-partition */ -template -class parent_group_info { +template class parent_group_info { public: // Returns the linear rank of the group within the set of tiles partitioned // from a parent group (bounded by meta_group_size) @@ -675,37 +719,38 @@ class thread_block_tile_type : public thread_block_tile_base, public parent_group_info { _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; typedef thread_block_tile_base tbtBase; - protected: - __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) { - coalesced_info.tiled_info.size = numThreads; - coalesced_info.tiled_info.is_tiled = true; - } - public: - using tbtBase::size; - using tbtBase::sync; - using tbtBase::thread_rank; + +protected: + __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) { + coalesced_info.tiled_info.size = numThreads; + coalesced_info.tiled_info.is_tiled = true; + } + +public: + using tbtBase::size; + using tbtBase::sync; + using tbtBase::thread_rank; }; // Partial template specialization template -class thread_block_tile_type : public thread_block_tile_base, - public tiled_group - { +class thread_block_tile_type + : public thread_block_tile_base, public tiled_group { _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; typedef thread_block_tile_base tbtBase; - protected: - - __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size) - : tiled_group(numThreads) { +protected: + __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, + unsigned int meta_group_size) + : tiled_group(numThreads) { coalesced_info.tiled_info.size = numThreads; coalesced_info.tiled_info.is_tiled = true; coalesced_info.tiled_info.meta_group_rank = meta_group_rank; coalesced_info.tiled_info.meta_group_size = meta_group_size; } - public: +public: using tbtBase::size; using tbtBase::sync; using tbtBase::thread_rank; @@ -717,46 +762,47 @@ class thread_block_tile_type : public thread_block_tile_base(&parent); + const tiled_group *cg = static_cast(&parent); return cg->new_tiled_group(tile_size); - } - else if(parent.cg_type() == internal::cg_coalesced_group) { - const coalesced_group* cg = static_cast(&parent); + } else if (parent.cg_type() == internal::cg_coalesced_group) { + const coalesced_group *cg = static_cast(&parent); return cg->new_tiled_group(tile_size); - } - else { - const thread_block* tb = static_cast(&parent); + } else { + const thread_block *tb = static_cast(&parent); return tb->new_tiled_group(tile_size); } } // Thread block type overload -__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) { +__CG_QUALIFIER__ thread_group tiled_partition(const thread_block &parent, + unsigned int tile_size) { return (parent.new_tiled_group(tile_size)); } -__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) { +__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group &parent, + unsigned int tile_size) { return (parent.new_tiled_group(tile_size)); } // If a coalesced group is passed to be partitioned, it should remain coalesced -__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) { - return (parent.new_tiled_group(tile_size)); +__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group &parent, + unsigned int tile_size) { + return (parent.new_tiled_group(tile_size)); } template class thread_block_tile; @@ -765,39 +811,43 @@ namespace impl { template class thread_block_tile_internal; template -class thread_block_tile_internal : public thread_block_tile_type { - protected: +class thread_block_tile_internal + : public thread_block_tile_type { +protected: template __CG_QUALIFIER__ thread_block_tile_internal( - const thread_block_tile_internal& g) - : thread_block_tile_type(g.meta_group_rank(), g.meta_group_size()) {} + const thread_block_tile_internal &g) + : thread_block_tile_type(g.meta_group_rank(), + g.meta_group_size()) {} - __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g) + __CG_QUALIFIER__ thread_block_tile_internal(const thread_block &g) : thread_block_tile_type() {} }; -} // namespace impl +} // namespace impl template -class thread_block_tile : public impl::thread_block_tile_internal { - protected: - __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g) +class thread_block_tile + : public impl::thread_block_tile_internal { +protected: + __CG_QUALIFIER__ thread_block_tile(const ParentCGTy &g) : impl::thread_block_tile_internal(g) {} - public: +public: __CG_QUALIFIER__ operator thread_block_tile() const { return thread_block_tile(*this); } }; - template -class thread_block_tile : public impl::thread_block_tile_internal { +class thread_block_tile + : public impl::thread_block_tile_internal { template friend class thread_block_tile; - protected: - public: +protected: +public: template - __CG_QUALIFIER__ thread_block_tile(const thread_block_tile& g) + __CG_QUALIFIER__ + thread_block_tile(const thread_block_tile &g) : impl::thread_block_tile_internal(g) {} }; @@ -807,25 +857,29 @@ namespace impl { template struct tiled_partition_internal; template -struct tiled_partition_internal : public thread_block_tile { - __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g) +struct tiled_partition_internal + : public thread_block_tile { + __CG_QUALIFIER__ tiled_partition_internal(const thread_block &g) : thread_block_tile(g) {} }; -} // namespace impl +} // namespace impl /** \brief User exposed API to partition groups. * * \details This constructs a templated class derieved from thread_group. - * The template defines tile size of the new thread group at compile time. + * The template defines tile size of the new thread group at compile + * time. */ template -__CG_QUALIFIER__ thread_block_tile tiled_partition(const ParentCGTy& g) { - static_assert(is_valid_tile_size::value, - "Tiled partition with size > wavefront size. Currently not supported "); +__CG_QUALIFIER__ thread_block_tile +tiled_partition(const ParentCGTy &g) { + static_assert( + is_valid_tile_size::value, + "Tiled partition with size > wavefront size. Currently not supported "); return impl::tiled_partition_internal(g); } -} // namespace cooperative_groups +} // namespace cooperative_groups -#endif // __cplusplus -#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/spirv_hip_cooperative_groups_helper.h b/include/hip/spirv_hip_cooperative_groups_helper.h index 235cebfd8..c1c0fe1c5 100644 --- a/include/hip/spirv_hip_cooperative_groups_helper.h +++ b/include/hip/spirv_hip_cooperative_groups_helper.h @@ -66,37 +66,40 @@ template using is_power_of_2 = std::integral_constant; template -using is_valid_wavefront = std::integral_constant; +using is_valid_wavefront = + std::integral_constant; template using is_valid_tile_size = - std::integral_constant::value && is_valid_wavefront::value>; + std::integral_constant::value && + is_valid_wavefront::value>; template using is_valid_type = - std::integral_constant::value || std::is_floating_point::value>; - + std::integral_constant::value || + std::is_floating_point::value>; // TODO Cooperative Groups -uint32_t __device__ __ockl_multi_grid_num_grids() {return 0;}; -uint32_t __device__ __ockl_multi_grid_grid_rank() {return 0;}; -uint32_t __device__ __ockl_multi_grid_size() {return 0;}; -uint32_t __device__ __ockl_multi_grid_thread_rank() {return 0;}; -uint32_t __device__ __ockl_multi_grid_is_valid() {return 0;}; -uint32_t __device__ __ockl_multi_grid_sync() {return 0;}; -uint32_t __device__ __ockl_grid_sync() {return 0;}; -uint32_t __device__ __ockl_grid_is_valid() {return 0;}; -void __device__ __builtin_amdgcn_fence(int, const char*){}; -unsigned int __device__ __builtin_amdgcn_mbcnt_lo(unsigned int, unsigned int){return 0;}; -unsigned int __device__ __builtin_amdgcn_read_exec(void){return 0;}; - +uint32_t __device__ __ockl_multi_grid_num_grids() { return 0; }; +uint32_t __device__ __ockl_multi_grid_grid_rank() { return 0; }; +uint32_t __device__ __ockl_multi_grid_size() { return 0; }; +uint32_t __device__ __ockl_multi_grid_thread_rank() { return 0; }; +uint32_t __device__ __ockl_multi_grid_is_valid() { return 0; }; +uint32_t __device__ __ockl_multi_grid_sync() { return 0; }; +uint32_t __device__ __ockl_grid_sync() { return 0; }; +uint32_t __device__ __ockl_grid_is_valid() { return 0; }; +void __device__ __builtin_amdgcn_fence(int, const char *){}; +unsigned int __device__ __builtin_amdgcn_mbcnt_lo(unsigned int, unsigned int) { + return 0; +}; +unsigned int __device__ __builtin_amdgcn_read_exec(void) { return 0; }; namespace internal { /** -* @brief Enums representing different cooperative group types -* @note This enum is only applicable on Linux. -* + * @brief Enums representing different cooperative group types + * @note This enum is only applicable on Linux. + * */ typedef enum { cg_invalid, @@ -110,59 +113,67 @@ typedef enum { * @ingroup CooperativeG * @{ * This section describes the cooperative groups functions of HIP runtime API. - * - * The cooperative groups provides flexible thread parallel programming algorithms, threads - * cooperate and share data to perform collective computations. * - * @note Cooperative groups feature is implemented on Linux, under developement - * on Windows. + * The cooperative groups provides flexible thread parallel programming + * algorithms, threads cooperate and share data to perform collective + * computations. + * + * @note Cooperative groups feature is implemented on Linux, under + * developement on Windows. * */ /** * * @brief Functionalities related to multi-grid cooperative group type - * @note The following cooperative groups functions are only applicable on Linux. + * @note The following cooperative groups functions are only applicable on + * Linux. * */ - - namespace multi_grid { - - __CG_STATIC_QUALIFIER__ uint32_t num_grids() { - return static_cast(__ockl_multi_grid_num_grids()); } + return static_cast(__ockl_multi_grid_num_grids()); +} __CG_STATIC_QUALIFIER__ uint32_t grid_rank() { - return static_cast(__ockl_multi_grid_grid_rank()); } + return static_cast(__ockl_multi_grid_grid_rank()); +} -__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast(__ockl_multi_grid_size()); } +__CG_STATIC_QUALIFIER__ uint32_t size() { + return static_cast(__ockl_multi_grid_size()); +} __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - return static_cast(__ockl_multi_grid_thread_rank()); } + return static_cast(__ockl_multi_grid_thread_rank()); +} -__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_multi_grid_is_valid()); } +__CG_STATIC_QUALIFIER__ bool is_valid() { + return static_cast(__ockl_multi_grid_is_valid()); +} __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } -} // namespace multi_grid +} // namespace multi_grid /** * @brief Functionalities related to grid cooperative group type - * @note The following cooperative groups functions are only applicable on Linux. + * @note The following cooperative groups functions are only applicable on + * Linux. */ namespace grid { __CG_STATIC_QUALIFIER__ uint32_t size() { - return static_cast((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * - (blockDim.x * gridDim.x)); + return static_cast((blockDim.z * gridDim.z) * + (blockDim.y * gridDim.y) * + (blockDim.x * gridDim.x)); } __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { // Compute global id of the workgroup to which the current thread belongs to - uint32_t blkIdx = static_cast((blockIdx.z * gridDim.y * gridDim.x) + - (blockIdx.y * gridDim.x) + (blockIdx.x)); + uint32_t blkIdx = + static_cast((blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + (blockIdx.x)); // Compute total number of threads being passed to reach current workgroup // within grid @@ -170,32 +181,38 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { static_cast(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); // Compute thread local rank within current workgroup - uint32_t local_thread_rank = static_cast((threadIdx.z * blockDim.y * blockDim.x) + - (threadIdx.y * blockDim.x) + (threadIdx.x)); + uint32_t local_thread_rank = + static_cast((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + (threadIdx.x)); return (num_threads_till_current_workgroup + local_thread_rank); } -__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_grid_is_valid()); } +__CG_STATIC_QUALIFIER__ bool is_valid() { + return static_cast(__ockl_grid_is_valid()); +} __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } -} // namespace grid +} // namespace grid /** - * @brief Functionalities related to `workgroup` (thread_block in CUDA terminology) - * cooperative group type - * @note The following cooperative groups functions are only applicable on Linux. + * @brief Functionalities related to `workgroup` (thread_block in CUDA + * terminology) cooperative group type + * @note The following cooperative groups functions are only applicable on + * Linux. */ namespace workgroup { __CG_STATIC_QUALIFIER__ dim3 group_index() { - return (dim3(static_cast(blockIdx.x), static_cast(blockIdx.y), + return (dim3(static_cast(blockIdx.x), + static_cast(blockIdx.y), static_cast(blockIdx.z))); } __CG_STATIC_QUALIFIER__ dim3 thread_index() { - return (dim3(static_cast(threadIdx.x), static_cast(threadIdx.y), + return (dim3(static_cast(threadIdx.x), + static_cast(threadIdx.y), static_cast(threadIdx.z))); } @@ -205,59 +222,62 @@ __CG_STATIC_QUALIFIER__ uint32_t size() { __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (static_cast((threadIdx.z * blockDim.y * blockDim.x) + - (threadIdx.y * blockDim.x) + (threadIdx.x))); + (threadIdx.y * blockDim.x) + (threadIdx.x))); } -__CG_STATIC_QUALIFIER__ bool is_valid() { - return true; -} +__CG_STATIC_QUALIFIER__ bool is_valid() { return true; } __CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); } __CG_STATIC_QUALIFIER__ dim3 block_dim() { - return (dim3(static_cast(blockDim.x), static_cast(blockDim.y), - static_cast(blockDim.z))); + return (dim3(static_cast(blockDim.x), + static_cast(blockDim.y), + static_cast(blockDim.z))); } -} // namespace workgroup +} // namespace workgroup namespace tiled_group { // enforce ordering for memory intructions -__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); } +__CG_STATIC_QUALIFIER__ void sync() { + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); +} -} // namespace tiled_group +} // namespace tiled_group namespace coalesced_group { // enforce ordering for memory intructions -__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); } +__CG_STATIC_QUALIFIER__ void sync() { + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); +} // Masked bit count // // For each thread, this function returns the number of active threads which // have i-th bit of x set and come before the current thread. -__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { - unsigned int counter=0; - #if __AMDGCN_WAVEFRONT_SIZE == 32 - counter = __builtin_amdgcn_mbcnt_lo(x, add); - #else - counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); - counter = __builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), counter); - #endif - - return counter; -} +__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, + unsigned int add = 0) { + unsigned int counter = 0; +#if __AMDGCN_WAVEFRONT_SIZE == 32 + counter = __builtin_amdgcn_mbcnt_lo(x, add); +#else + counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); + counter = __builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), counter); +#endif -} // namespace coalesced_group + return counter; +} +} // namespace coalesced_group -} // namespace internal +} // namespace internal -} // namespace cooperative_groups +} // namespace cooperative_groups /** -* @} -*/ + * @} + */ -#endif // __cplusplus -#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H diff --git a/include/hip/spirv_hip_runtime.h b/include/hip/spirv_hip_runtime.h index 9fb4d82d6..7d19b3c60 100644 --- a/include/hip/spirv_hip_runtime.h +++ b/include/hip/spirv_hip_runtime.h @@ -60,17 +60,17 @@ struct hipGraphExec {}; #define __managed__ __device__ typedef struct hipArray { - void* data; // FIXME: generalize this - struct hipChannelFormatDesc desc; - unsigned int type; - unsigned int width; - unsigned int height; - unsigned int depth; - enum hipArray_Format Format; - unsigned int NumChannels; - bool isDrv; - unsigned int textureType; -}hipArray; + void *data; // FIXME: generalize this + struct hipChannelFormatDesc desc; + unsigned int type; + unsigned int width; + unsigned int height; + unsigned int depth; + enum hipArray_Format Format; + unsigned int NumChannels; + bool isDrv; + unsigned int textureType; +} hipArray; // Feature tests: #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || \ diff --git a/samples/hipInfo/hipInfo.cpp b/samples/hipInfo/hipInfo.cpp index fea1e5483..bdda9745b 100644 --- a/samples/hipInfo/hipInfo.cpp +++ b/samples/hipInfo/hipInfo.cpp @@ -136,7 +136,9 @@ void printDeviceProp(int deviceId) { cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; - cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) + << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism + << endl; cout << setw(w1) << "maxTexture1DLinear: " << props.maxTexture1DLinear << endl; diff --git a/src/CHIPBindings.cc b/src/CHIPBindings.cc index 2f7f3caaa..1b2c72220 100644 --- a/src/CHIPBindings.cc +++ b/src/CHIPBindings.cc @@ -71,13 +71,15 @@ #define DECONST_NODES(x) \ reinterpret_cast(const_cast(x)) -hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) { +hipError_t hipModuleGetTexRef(textureReference **texRef, hipModule_t hmod, + const char *name) { UNIMPLEMENTED(hipErrorNotSupported); } hipError_t hipFreeArray(hipArray *Array); -hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) { +hipError_t hipFuncSetSharedMemConfig(const void *func, + hipSharedMemConfig config) { UNIMPLEMENTED(hipErrorNotSupported); } @@ -1494,7 +1496,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, attributes->devicePointer = const_cast(ptr); attributes->hostPointer = AllocInfo->HostPtr; attributes->isManaged = AllocInfo->Managed; - attributes->type = AllocInfo->MemoryType; + attributes->type = AllocInfo->MemoryType; // Seems strange but the expected behavior is that if // hipPointerGetAttributes gets called with an offset host pointer, the @@ -4079,7 +4081,8 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes *Attr, CHIP_CATCH } -hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc) { +hipError_t hipFuncGetAttribute(int *value, hipFunction_attribute attrib, + hipFunction_t hfunc) { UNIMPLEMENTED(hipErrorTbd); } @@ -4894,8 +4897,6 @@ hipOccupancyMaxActiveBlocksPerMultiprocessor(int *NumBlocks, const void *Func, CHIP_CATCH } - - hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int *NumBlocks, const void *Func, int BlockSize, size_t DynSharedMemPerBlk, unsigned int Flags) { diff --git a/src/common.hh b/src/common.hh index b4d3ebd49..cc5103d4b 100644 --- a/src/common.hh +++ b/src/common.hh @@ -41,8 +41,6 @@ #include #include - - using SPVFunctionInfoMap = std::map>; struct SPVModuleInfo { From 31daec5795b31a50542324db31d6fb05a95f1699 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 02:45:14 +0300 Subject: [PATCH 30/33] device-side placeholders for malloc, free, wall_clock --- bitcode/devicelib.cl | 3 +++ hip-tests | 2 +- include/hip/spirv_hip_devicelib.hh | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/bitcode/devicelib.cl b/bitcode/devicelib.cl index fc136caaf..8ddd8b9d1 100644 --- a/bitcode/devicelib.cl +++ b/bitcode/devicelib.cl @@ -43,6 +43,9 @@ #error __opencl_c_generic_address_space needed! #endif +NOOPT void* device_malloc(unsigned int size) {return (void*)0;}; +NOOPT void device_free(void* ptr) {}; + // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. // If not found, return -1. diff --git a/hip-tests b/hip-tests index 52a8b06b0..bbbfe89ed 160000 --- a/hip-tests +++ b/hip-tests @@ -1 +1 @@ -Subproject commit 52a8b06b035b95a602f6d18a8ab8f9ce4e92ef4d +Subproject commit bbbfe89edb2386664395677cff46b59abed2904d diff --git a/include/hip/spirv_hip_devicelib.hh b/include/hip/spirv_hip_devicelib.hh index 865ceb5d1..e7cba5ff0 100644 --- a/include/hip/spirv_hip_devicelib.hh +++ b/include/hip/spirv_hip_devicelib.hh @@ -71,6 +71,11 @@ THE SOFTWARE. #pragma push_macro("__HIP_OVERLOAD") #pragma push_macro("__HIP_OVERLOAD2") +__device__ void* device_malloc(unsigned int size); +__device__ void device_free(void* ptr); +EXPORT void* malloc(size_t size) { return device_malloc(size); } +EXPORT void free(void* ptr) { device_free(ptr); }; + // __hip_enable_if::type is a type function which returns __T if __B is true. template struct __hip_enable_if {}; @@ -158,6 +163,16 @@ EXPORT unsigned long long clock64() { // loss can be avoided. EXPORT clock_t clock() { return (clock_t)clock64(); } +EXPORT unsigned long long wall_clock64() { + atomicAdd(&__chip_clk_counter, 1); + return __chip_clk_counter; +} +// TODO: This is a temporary implementation of clock(), +// in future it will be changed with more reliable implementation. +// It is encouraged to use clock64() over clock() so that chance of data +// loss can be avoided. +EXPORT clock_t wall_clock() { return (clock_t)wall_clock64(); } + #include #endif From 6d9d9da99387c9c08c5184371b9ee24569bd1611 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 02:45:22 +0300 Subject: [PATCH 31/33] update Features.md --- docs/Features.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Features.md b/docs/Features.md index d8cc740f5..357ae8292 100644 --- a/docs/Features.md +++ b/docs/Features.md @@ -43,6 +43,8 @@ CUDA features not present in HIP are unsupported unless explicitly stated otherw * Cooperative Groups API +* Device-side malloc/free + #### partially supported * Texture Objects of 1D/2D type are supported; 3D, LOD, Grad, From e8d3ec95f04aa0d044a57c104f344d03c005774c Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 02:46:50 +0300 Subject: [PATCH 32/33] fmt --- include/hip/spirv_hip_devicelib.hh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/hip/spirv_hip_devicelib.hh b/include/hip/spirv_hip_devicelib.hh index e7cba5ff0..effdf60f7 100644 --- a/include/hip/spirv_hip_devicelib.hh +++ b/include/hip/spirv_hip_devicelib.hh @@ -71,10 +71,10 @@ THE SOFTWARE. #pragma push_macro("__HIP_OVERLOAD") #pragma push_macro("__HIP_OVERLOAD2") -__device__ void* device_malloc(unsigned int size); -__device__ void device_free(void* ptr); -EXPORT void* malloc(size_t size) { return device_malloc(size); } -EXPORT void free(void* ptr) { device_free(ptr); }; +__device__ void *device_malloc(unsigned int size); +__device__ void device_free(void *ptr); +EXPORT void *malloc(size_t size) { return device_malloc(size); } +EXPORT void free(void *ptr) { device_free(ptr); }; // __hip_enable_if::type is a type function which returns __T if __B is true. template struct __hip_enable_if {}; From 13d7590fb7a1043ddbe84c344feb6bbb6b798184 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Wed, 3 Apr 2024 03:33:37 +0300 Subject: [PATCH 33/33] adjust clang-tidy --- .clang-tidy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index b05ad3966..9c36439b7 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,5 +1,5 @@ --- -Checks: 'readability-identifier-naming' +Checks: '-*,readability-*,modernize-*,clang-analyzer-*' WarningsAsErrors: '' HeaderFilterRegex: './src/.*' AnalyzeTemporaryDtors: false