Skip to content

Commit

Permalink
Infer memcpy direction in lacpy for buggy HIP versions
Browse files Browse the repository at this point in the history
  • Loading branch information
msimberg committed Mar 1, 2024
1 parent 5b8f856 commit 6a908e1
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 11 deletions.
10 changes: 4 additions & 6 deletions include/dlaf/eigensolver/band_to_tridiag/mc.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,11 +292,11 @@ class BandBlock {
const auto index = std::max(SizeType{0}, source.size().cols() - band_size_);
if (index > 0) {
gpulapack::lacpy(General, band_size_ + 1, index, source.ptr(), source.ld() + 1, ptr(0, j),
ld() + 1, whip::memcpy_device_to_host, stream);
ld() + 1, stream);
}
const auto size = std::min(band_size_, source.size().cols());
gpulapack::lacpy(Lower, size, size, source.ptr({index, index}), source.ld(),
ptr(0, j + index), ld(), whip::memcpy_device_to_host, stream);
ptr(0, j + index), ld(), stream);
},
std::move(source));
}
Expand Down Expand Up @@ -367,13 +367,11 @@ class BandBlock {
const auto index = source.size().cols() - band_size_;
const auto size = std::min(band_size_, source.size().rows());
auto dest = ptr(band_size_, j + index);
gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(),
whip::memcpy_device_to_host, stream);
gpulapack::lacpy(Upper, size, size, source.ptr({0, index}), source.ld(), dest, ld(), stream);
if (band_size_ > size) {
const auto size2 = band_size_ - size;
gpulapack::lacpy(General, source.size().rows(), size2, source.ptr({0, index + size}),
source.ld(), dest + ld() * size, ld(), whip::memcpy_device_to_host,
stream);
source.ld(), dest + ld() * size, ld(), stream);
}
},
std::move(source));
Expand Down
4 changes: 2 additions & 2 deletions include/dlaf/lapack/gpu/lacpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ namespace dlaf::gpulapack {

template <class T>
void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda,
T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream);
T* b, const SizeType ldb, const whip::stream_t stream);

#define DLAF_CUBLAS_LACPY_ETI(kword, Type) \
kword template void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const Type* a, \
const SizeType lda, Type* b, const SizeType ldb, \
const whip::memcpy_kind kind, const whip::stream_t stream)
const whip::stream_t stream)

DLAF_CUBLAS_LACPY_ETI(extern, float);
DLAF_CUBLAS_LACPY_ETI(extern, double);
Expand Down
36 changes: 35 additions & 1 deletion src/lapack/gpu/lacpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ __global__ void lacpy(cublasFillMode_t uplo, const unsigned m, const unsigned n,

template <class T>
void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a, const SizeType lda,
T* b, const SizeType ldb, const whip::memcpy_kind kind, const whip::stream_t stream) {
T* b, const SizeType ldb, const whip::stream_t stream) {
if (m == 0 || n == 0)
return;

Expand All @@ -118,6 +118,40 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a
constexpr unsigned kernel_tile_size_cols = kernels::LacpyParams::kernel_tile_size_cols;

if (uplo == blas::Uplo::General) {
whip::memcpy_kind kind = whip::memcpy_default;

// If HIP is version 5.6.0 or newer, do not use hipMemcpyDefault as it is buggy with 2D memcpy.
// Instead try to infer the memory type using hipPointerGetAttributes. See:
// - https://github.com/ROCm/clr/commit/56daa6c4891b43ec233e9c63f755e3f7b45842b4
// - https://github.com/ROCm/clr/commit/d3bfb55d7a934355257a72fab538a0a634b43cad
#if defined(DLAF_WITH_HIP) && HIP_VERSION >= 50600000
hipPointerAttribute_t src_attributes{};
whip::check_error(hipPointerGetAttributes(&src_attributes, a));

hipPointerAttribute_t dest_attributes{};
whip::check_error(hipPointerGetAttributes(&dest_attributes, b));

if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeHost) {
kind = whip::memcpy_device_to_host;
}
else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeDevice) {
kind = whip::memcpy_host_to_device;
}
else if (src_attributes.type == hipMemoryTypeDevice && dest_attributes.type == hipMemoryTypeDevice) {
kind = whip::memcpy_device_to_device;
}
else if (src_attributes.type == hipMemoryTypeHost && dest_attributes.type == hipMemoryTypeHost) {
DLAF_ASSERT(
false,
"Attempting to do a HIP lacpy with host source and destination, use the CPU lacpy instead");
}
else {
DLAF_ASSERT(false,
"Attempting to do a HIP lacpy with unsupported source and destination memory type",
src_attributes.type, dest_attributes.type);
}
#endif

whip::memcpy_2d_async(b, to_sizet(ldb) * sizeof(T), a, to_sizet(lda) * sizeof(T),
to_sizet(m) * sizeof(T), to_sizet(n), kind, stream);
}
Expand Down
3 changes: 1 addition & 2 deletions test/unit/lapack/gpu/test_lacpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ TYPED_TEST(LacpyTestGPU, CorrectnessLocal) {
auto tile_src = createTile<const T, Device::GPU>(el, {m, n}, lda);
auto tile_dst = createTile<T, Device::GPU>(zero, {m, n}, ldb);

gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(),
whip::memcpy_device_to_device, stream);
gpulapack::lacpy(uplo, m, n, tile_src.ptr(), tile_src.ld(), tile_dst.ptr(), tile_dst.ld(), stream);
whip::stream_synchronize(stream);

// Verify
Expand Down

0 comments on commit 6a908e1

Please sign in to comment.