diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h index 073cc4d333..e4c8aee74b 100644 --- a/include/dlaf/eigensolver/band_to_tridiag/mc.h +++ b/include/dlaf/eigensolver/band_to_tridiag/mc.h @@ -292,11 +292,11 @@ class BandBlock { const auto index = std::max(SizeType{0}, source.size().cols() - band_size_); if (index > 0) { gpulapack::lacpy(General, band_size_ + 1, index, source.ptr(), source.ld() + 1, ptr(0, j), - ld() + 1, whip::memcpy_device_to_host, stream); + ld() + 1, stream); } const auto size = std::min(band_size_, source.size().cols()); gpulapack::lacpy(Lower, size, size, source.ptr({index, index}), source.ld(), - ptr(0, j + index), ld(), whip::memcpy_device_to_host, stream); + ptr(0, j + index), ld(), stream); }, std::move(source)); } @@ -367,13 +367,11 @@ class BandBlock { const auto index = source.size().cols() - band_size_; const auto size = std::min(band_size_, source.size().rows()); auto dest = ptr(band_size_, j + index); - gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), - whip::memcpy_device_to_host, stream); + gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), stream); if (band_size_ > size) { const auto size2 = band_size_ - size; gpulapack::lacpy(General, source.size().rows(), size2, source.ptr({0, index + size}), - source.ld(), dest + ld() * size, ld(), whip::memcpy_device_to_host, - stream); + source.ld(), dest + ld() * size, ld(), stream); } }, std::move(source)); diff --git a/include/dlaf/lapack/gpu/lacpy.h b/include/dlaf/lapack/gpu/lacpy.h index 8c10b3651e..92642a43d4 100644 --- a/include/dlaf/lapack/gpu/lacpy.h +++ b/include/dlaf/lapack/gpu/lacpy.h @@ -22,12 +22,12 @@ namespace dlaf::gpulapack { template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream); + T* b, const SizeType ldb, const whip::stream_t stream); #define DLAF_CUBLAS_LACPY_ETI(kword, Type) \ kword template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const Type* a, \ const SizeType lda, Type* b, const SizeType ldb, \ - const whip::memcpy_kind kind, const whip::stream_t stream) + const whip::stream_t stream) DLAF_CUBLAS_LACPY_ETI(extern, float); DLAF_CUBLAS_LACPY_ETI(extern, double); diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 2313c69a6d..8f04d30234 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -107,7 +107,7 @@ __global__ void lacpy(cublasFillMode_t uplo, const unsigned m, const unsigned n, template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda, - T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream) { + T* b, const SizeType ldb, const whip::stream_t stream) { if (m == 0 || n == 0) return; @@ -118,6 +118,40 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a constexpr unsigned kernel_tile_size_cols = kernels::LacpyParams::kernel_tile_size_cols; if (uplo == blas::Uplo::General) { + whip::memcpy_kind kind = whip::memcpy_default; + + // If HIP is version 5.6.0 or newer, do not use hipMemcpyDefault as it is buggy with 2D memcpy. + // Instead try to infer the memory type using hipPointerGetAttributes. See: + // - https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4 + // - https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad +#if defined(DLAF_WITH_HIP) && HIP_VERSION >= 50600000 + hipPointerAttribute_t src_attributes{}; + whip::check_error(hipPointerGetAttributes(&src_attributes, a)); + + hipPointerAttribute_t dest_attributes{}; + whip::check_error(hipPointerGetAttributes(&dest_attributes, b)); + + if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeHost) { + kind = whip::memcpy_device_to_host; + } + else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeDevice) { + kind = whip::memcpy_host_to_device; + } + else if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeDevice) { + kind = whip::memcpy_device_to_device; + } + else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeHost) { + DLAF_ASSERT( + false, + "Attempting to do a HIP lacpy with host source and destination, use the CPU lacpy instead"); + } + else { + DLAF_ASSERT(false, + "Attempting to do a HIP lacpy with unsupported source and destination memory type", + src_attributes.type, dest_attributes.type); + } +#endif + whip::memcpy_2d_async(b, to_sizet(ldb) * sizeof(T), a, to_sizet(lda) * sizeof(T), to_sizet(m) * sizeof(T), to_sizet(n), kind, stream); } diff --git a/test/unit/lapack/gpu/test_lacpy.cpp b/test/unit/lapack/gpu/test_lacpy.cpp index bb633b55d7..16a73a5827 100644 --- a/test/unit/lapack/gpu/test_lacpy.cpp +++ b/test/unit/lapack/gpu/test_lacpy.cpp @@ -70,8 +70,7 @@ TYPED_TEST(LacpyTestGPU, CorrectnessLocal) { auto tile_src = createTile(el, {m, n}, lda); auto tile_dst = createTile(zero, {m, n}, ldb); - gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), - whip::memcpy_device_to_device, stream); + gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), stream); whip::stream_synchronize(stream); // Verify