From 5b8f856b433bee20e7f7791b95987ddc2cbd0d79 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 28 Feb 2024 12:31:58 +0100 Subject: [PATCH 1/4] Don't use hipMemcpyDefault due to bugs in HIP 2D memcpys involving host buffers with non-zero offsets are broken on HIP 5.6 onwards. Some cases have been fixed in https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4 (included in 5.7.0) and further fixes in are in https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad (not released as of this commit). --- include/dlaf/eigensolver/band_to_tridiag/mc.h | 10 ++++++---- include/dlaf/factorization/qr/t_factor_impl.h | 2 +- include/dlaf/lapack/gpu/lacpy.h | 4 ++-- src/lapack/gpu/lacpy.cu | 3 +-- test/unit/lapack/gpu/test_lacpy.cpp | 3 ++- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h index e4c8aee74b..073cc4d333 100644 --- a/include/dlaf/eigensolver/band_to_tridiag/mc.h +++ b/include/dlaf/eigensolver/band_to_tridiag/mc.h @@ -292,11 +292,11 @@ class BandBlock { const auto index = std::max(SizeType{0}, source.size().cols() - band_size_); if (index > 0) { gpulapack::lacpy(General, band_size_ + 1, index, source.ptr(), source.ld() + 1, ptr(0, j), - ld() + 1, stream); + ld() + 1, whip::memcpy_device_to_host, stream); } const auto size = std::min(band_size_, source.size().cols()); gpulapack::lacpy(Lower, size, size, source.ptr({index, index}), source.ld(), - ptr(0, j + index), ld(), stream); + ptr(0, j + index), ld(), whip::memcpy_device_to_host, stream); }, std::move(source)); } @@ -367,11 +367,13 @@ class BandBlock { const auto index = source.size().cols() - band_size_; const auto size = std::min(band_size_, source.size().rows()); auto dest = ptr(band_size_, j + index); - gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), stream); + gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), + whip::memcpy_device_to_host, stream); if (band_size_ > size) { const auto size2 = band_size_ - size; gpulapack::lacpy(General, source.size().rows(), size2, source.ptr({0, index + size}), - source.ld(), dest + ld() * size, ld(), stream); + source.ld(), dest + ld() * size, ld(), whip::memcpy_device_to_host, + stream); } }, std::move(source)); diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index b32139c343..81e052b36c 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -159,7 +159,7 @@ struct Helpers { // This assumes that elements in taus are contiguous, i.e. that it is a column vector of size k whip::memcpy_2d_async(tile_t.ptr(), to_sizet(tile_t.ld() + 1) * sizeof(T), taus.ptr(), sizeof(T), - sizeof(T), to_sizet(k), whip::memcpy_default, stream); + sizeof(T), to_sizet(k), whip::memcpy_host_to_device, stream); } for (SizeType j = 0; j < k; ++j) { diff --git a/include/dlaf/lapack/gpu/lacpy.h b/include/dlaf/lapack/gpu/lacpy.h index 92642a43d4..8c10b3651e 100644 --- a/include/dlaf/lapack/gpu/lacpy.h +++ b/include/dlaf/lapack/gpu/lacpy.h @@ -22,12 +22,12 @@ namespace dlaf::gpulapack { template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::stream_t stream); + T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream); #define DLAF_CUBLAS_LACPY_ETI(kword, Type) \ kword template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const Type* a, \ const SizeType lda, Type* b, const SizeType ldb, \ - const whip::stream_t stream) + const whip::memcpy_kind kind, const whip::stream_t stream) DLAF_CUBLAS_LACPY_ETI(extern, float); DLAF_CUBLAS_LACPY_ETI(extern, double); diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 16baa883e2..2313c69a6d 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -107,7 +107,7 @@ __global__ void lacpy(cublasFillMode_t uplo, const unsigned m, const unsigned n, template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::stream_t stream) { + T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream) { if (m == 0 || n == 0) return; @@ -118,7 +118,6 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a constexpr unsigned kernel_tile_size_cols = kernels::LacpyParams::kernel_tile_size_cols; if (uplo == blas::Uplo::General) { - const whip::memcpy_kind kind = whip::memcpy_default; whip::memcpy_2d_async(b, to_sizet(ldb) * sizeof(T), a, to_sizet(lda) * sizeof(T), to_sizet(m) * sizeof(T), to_sizet(n), kind, stream); } diff --git a/test/unit/lapack/gpu/test_lacpy.cpp b/test/unit/lapack/gpu/test_lacpy.cpp index 16a73a5827..bb633b55d7 100644 --- a/test/unit/lapack/gpu/test_lacpy.cpp +++ b/test/unit/lapack/gpu/test_lacpy.cpp @@ -70,7 +70,8 @@ TYPED_TEST(LacpyTestGPU, CorrectnessLocal) { auto tile_src = createTile(el, {m, n}, lda); auto tile_dst = createTile(zero, {m, n}, ldb); - gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), stream); + gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), + whip::memcpy_device_to_device, stream); whip::stream_synchronize(stream); // Verify From 6a908e1d5f38aed2b0934e7b184244f74e8ebe38 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 1 Mar 2024 10:10:38 +0100 Subject: [PATCH 2/4] Infer memcpy direction in lacpy for buggy HIP versions --- include/dlaf/eigensolver/band_to_tridiag/mc.h | 10 +++--- include/dlaf/lapack/gpu/lacpy.h | 4 +-- src/lapack/gpu/lacpy.cu | 36 ++++++++++++++++++- test/unit/lapack/gpu/test_lacpy.cpp | 3 +- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h index 073cc4d333..e4c8aee74b 100644 --- a/include/dlaf/eigensolver/band_to_tridiag/mc.h +++ b/include/dlaf/eigensolver/band_to_tridiag/mc.h @@ -292,11 +292,11 @@ class BandBlock { const auto index = std::max(SizeType{0}, source.size().cols() - band_size_); if (index > 0) { gpulapack::lacpy(General, band_size_ + 1, index, source.ptr(), source.ld() + 1, ptr(0, j), - ld() + 1, whip::memcpy_device_to_host, stream); + ld() + 1, stream); } const auto size = std::min(band_size_, source.size().cols()); gpulapack::lacpy(Lower, size, size, source.ptr({index, index}), source.ld(), - ptr(0, j + index), ld(), whip::memcpy_device_to_host, stream); + ptr(0, j + index), ld(), stream); }, std::move(source)); } @@ -367,13 +367,11 @@ class BandBlock { const auto index = source.size().cols() - band_size_; const auto size = std::min(band_size_, source.size().rows()); auto dest = ptr(band_size_, j + index); - gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), - whip::memcpy_device_to_host, stream); + gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), stream); if (band_size_ > size) { const auto size2 = band_size_ - size; gpulapack::lacpy(General, source.size().rows(), size2, source.ptr({0, index + size}), - source.ld(), dest + ld() * size, ld(), whip::memcpy_device_to_host, - stream); + source.ld(), dest + ld() * size, ld(), stream); } }, std::move(source)); diff --git a/include/dlaf/lapack/gpu/lacpy.h b/include/dlaf/lapack/gpu/lacpy.h index 8c10b3651e..92642a43d4 100644 --- a/include/dlaf/lapack/gpu/lacpy.h +++ b/include/dlaf/lapack/gpu/lacpy.h @@ -22,12 +22,12 @@ namespace dlaf::gpulapack { template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream); + T* b, const SizeType ldb, const whip::stream_t stream); #define DLAF_CUBLAS_LACPY_ETI(kword, Type) \ kword template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const Type* a, \ const SizeType lda, Type* b, const SizeType ldb, \ - const whip::memcpy_kind kind, const whip::stream_t stream) + const whip::stream_t stream) DLAF_CUBLAS_LACPY_ETI(extern, float); DLAF_CUBLAS_LACPY_ETI(extern, double); diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 2313c69a6d..8f04d30234 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -107,7 +107,7 @@ __global__ void lacpy(cublasFillMode_t uplo, const unsigned m, const unsigned n, template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream) { + T* b, const SizeType ldb, const whip::stream_t stream) { if (m == 0 || n == 0) return; @@ -118,6 +118,40 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a constexpr unsigned kernel_tile_size_cols = kernels::LacpyParams::kernel_tile_size_cols; if (uplo == blas::Uplo::General) { + whip::memcpy_kind kind = whip::memcpy_default; + + // If HIP is version 5.6.0 or newer, do not use hipMemcpyDefault as it is buggy with 2D memcpy. + // Instead try to infer the memory type using hipPointerGetAttributes. See: + // - https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4 + // - https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad +#if defined(DLAF_WITH_HIP) && HIP_VERSION >= 50600000 + hipPointerAttribute_t src_attributes{}; + whip::check_error(hipPointerGetAttributes(&src_attributes, a)); + + hipPointerAttribute_t dest_attributes{}; + whip::check_error(hipPointerGetAttributes(&dest_attributes, b)); + + if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeHost) { + kind = whip::memcpy_device_to_host; + } + else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeDevice) { + kind = whip::memcpy_host_to_device; + } + else if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeDevice) { + kind = whip::memcpy_device_to_device; + } + else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeHost) { + DLAF_ASSERT( + false, + "Attempting to do a HIP lacpy with host source and destination, use the CPU lacpy instead"); + } + else { + DLAF_ASSERT(false, + "Attempting to do a HIP lacpy with unsupported source and destination memory type", + src_attributes.type, dest_attributes.type); + } +#endif + whip::memcpy_2d_async(b, to_sizet(ldb) * sizeof(T), a, to_sizet(lda) * sizeof(T), to_sizet(m) * sizeof(T), to_sizet(n), kind, stream); } diff --git a/test/unit/lapack/gpu/test_lacpy.cpp b/test/unit/lapack/gpu/test_lacpy.cpp index bb633b55d7..16a73a5827 100644 --- a/test/unit/lapack/gpu/test_lacpy.cpp +++ b/test/unit/lapack/gpu/test_lacpy.cpp @@ -70,8 +70,7 @@ TYPED_TEST(LacpyTestGPU, CorrectnessLocal) { auto tile_src = createTile(el, {m, n}, lda); auto tile_dst = createTile(zero, {m, n}, ldb); - gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), - whip::memcpy_device_to_device, stream); + gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), stream); whip::stream_synchronize(stream); // Verify From 2a547668b218b67224e2ee9730ec8991afb32804 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 7 Mar 2024 15:03:50 +0100 Subject: [PATCH 3/4] Refactor workaround for checking memcpy direction in lacpy --- src/lapack/gpu/lacpy.cu | 109 +++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 34 deletions(-) diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 8f04d30234..9ab2b5f5e4 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -105,6 +105,80 @@ __global__ void lacpy(cublasFillMode_t uplo, const unsigned m, const unsigned n, } } +static whip::memcpy_kind get_lacpy_memcpy_kind(const void* src, const void* dst) { +// If HIP is version 5.6 or newer, avoid the use of hipMemcpyDefault as it is +// buggy with 2D memcpy. Instead try to infer the memory type using +// hipPointerGetAttributes. See: +// - https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4 +// - https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad +// +// Note that hipMemoryTypeManaged is not available in older versions, but it is +// already available in 5.3 so we don't do a separate check for availability. +#if defined(DLAF_WITH_HIP) && HIP_VERSION >= 50600000 + constexpr auto get_memory_type = [](const void* p) { + hipPointerAttribute_t attributes{}; + hipError_t status = hipPointerGetAttributes(&attributes, p); + if (status == hipErrorInvalidValue) { + // If HIP returns hipErrorInvalidValue we assume it's due + // to HIP not recognizing a non-HIP allocated pointer as + // host memory, and we assume the type is host. + return hipMemoryTypeHost; + } + else if (status != hipSuccess) { + throw whip::exception(status); + } + + switch (attributes.type) { +#if HIP_VERSION >= 60000000 + case hipMemoryTypeUnregistered: + [[fallthrough]]; +#endif + case hipMemoryTypeHost: + return hipMemoryTypeHost; + case hipMemoryTypeArray: + [[fallthrough]]; + case hipMemoryTypeDevice: + return hipMemoryTypeDevice; + case hipMemoryTypeUnified: + return hipMemoryTypeUnified; + case hipMemoryTypeManaged: + return hipMemoryTypeManaged; + default: + DLAF_UNREACHABLE_PLAIN; + } + }; + + hipMemoryType src_type = get_memory_type(src); + hipMemoryType dst_type = get_memory_type(dst); + + if (src_type == hipMemoryTypeDevice && dst_type == hipMemoryTypeHost) { + return whip::memcpy_device_to_host; + } + else if (src_type == hipMemoryTypeHost && dst_type == hipMemoryTypeDevice) { + return whip::memcpy_host_to_device; + } + else if (src_type == hipMemoryTypeDevice && dst_type == hipMemoryTypeDevice) { + return whip::memcpy_device_to_device; + } + else if (src_type == hipMemoryTypeManaged || src_type == hipMemoryTypeUnified || + dst_type == hipMemoryTypeManaged || dst_type == hipMemoryTypeUnified) { + return whip::memcpy_default; + } + else if (src_type == hipMemoryTypeHost && dst_type == hipMemoryTypeHost) { + DLAF_ASSERT( + false, + "Attempting to do a HIP lacpy with host source and destination, use the CPU lacpy instead"); + } + else { + DLAF_ASSERT(false, + "Attempting to do a HIP lacpy with unsupported source and destination memory type", + src_type, dst_type); + } +#endif + + return whip::memcpy_default; +} + template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, T* b, const SizeType ldb, const whip::stream_t stream) { @@ -118,40 +192,7 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a constexpr unsigned kernel_tile_size_cols = kernels::LacpyParams::kernel_tile_size_cols; if (uplo == blas::Uplo::General) { - whip::memcpy_kind kind = whip::memcpy_default; - - // If HIP is version 5.6.0 or newer, do not use hipMemcpyDefault as it is buggy with 2D memcpy. - // Instead try to infer the memory type using hipPointerGetAttributes. See: - // - https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4 - // - https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad -#if defined(DLAF_WITH_HIP) && HIP_VERSION >= 50600000 - hipPointerAttribute_t src_attributes{}; - whip::check_error(hipPointerGetAttributes(&src_attributes, a)); - - hipPointerAttribute_t dest_attributes{}; - whip::check_error(hipPointerGetAttributes(&dest_attributes, b)); - - if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeHost) { - kind = whip::memcpy_device_to_host; - } - else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeDevice) { - kind = whip::memcpy_host_to_device; - } - else if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeDevice) { - kind = whip::memcpy_device_to_device; - } - else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeHost) { - DLAF_ASSERT( - false, - "Attempting to do a HIP lacpy with host source and destination, use the CPU lacpy instead"); - } - else { - DLAF_ASSERT(false, - "Attempting to do a HIP lacpy with unsupported source and destination memory type", - src_attributes.type, dest_attributes.type); - } -#endif - + whip::memcpy_kind kind = get_lacpy_memcpy_kind(a, b); whip::memcpy_2d_async(b, to_sizet(ldb) * sizeof(T), a, to_sizet(lda) * sizeof(T), to_sizet(m) * sizeof(T), to_sizet(n), kind, stream); } From 47d6f36caadaa1453a0fa7bccfc2c86c5d4208b8 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 7 Mar 2024 16:25:05 +0100 Subject: [PATCH 4/4] Silence unused variable warnings in lacpy.cu --- src/lapack/gpu/lacpy.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 9ab2b5f5e4..b5abd62211 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -174,6 +175,8 @@ static whip::memcpy_kind get_lacpy_memcpy_kind(const void* src, const void* dst) "Attempting to do a HIP lacpy with unsupported source and destination memory type", src_type, dst_type); } +#else + dlaf::internal::silenceUnusedWarningFor(src, dst); #endif return whip::memcpy_default;