Skip to content

Commit

Permalink
Fix: do dngvd on DCU rather than CPU
Browse files Browse the repository at this point in the history
  • Loading branch information
dyzheng committed Jan 16, 2025
1 parent d6bd011 commit ed32cb1
Show file tree
Hide file tree
Showing 2 changed files with 184 additions and 40 deletions.
3 changes: 1 addition & 2 deletions source/module_hsolver/kernels/cuda/dngvd_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,7 @@ struct dngvx_op<T, base_device::DEVICE_GPU>
T* scc,
const int m,
Real* eigenvalue,
T* vcc,
int* fail_info)
T* vcc)
{

}
Expand Down
221 changes: 183 additions & 38 deletions source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,23 @@

namespace hsolver {

// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op

static hipsolverHandle_t hipsolver_H = nullptr;

void createGpuSolverHandle() {
return;
if (hipsolver_H == nullptr)
{
hipsolverErrcheck(hipsolverCreate(&hipsolver_H));
}
}

void destroyGpuSolverHandle() {
return;
if (hipsolver_H != nullptr)
{
hipsolverErrcheck(hipsolverDestroy(hipsolver_H));
hipsolver_H = nullptr;
}
}

#ifdef __LCAO
Expand All @@ -24,23 +35,65 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
double* _vcc,
int* fail_info)
{
std::vector<double> hcc(nstart * nstart, 0.0);
std::vector<double> scc(nstart * nstart, 0.0);
std::vector<double> vcc(nstart * nstart, 0.0);
std::vector<double> eigenvalue(nstart, 0);
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
base_device::DEVICE_CPU* cpu_ctx = {};
dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
nstart,
ldh,
hcc.data(),
scc.data(),
eigenvalue.data(),
vcc.data(),
fail_info);
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
assert(nstart == ldh);

hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
// now vcc contains hcc

// prepare some values for hipsolverDnZhegvd_bufferSize
int * devInfo = nullptr;
int lwork = 0, info_gpu = 0;
double * work = nullptr;
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;

// calculate the sizes needed for pre-allocated buffer.
hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
_vcc, ldh,
_scc, ldh,
_eigenvalue,
&lwork));

// allocate memery
hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));

// compute eigenvalues and eigenvectors.
hipsolverErrcheck(hipsolverDnDsygvd(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
_vcc, ldh,
const_cast<double *>(_scc), ldh,
_eigenvalue,
work, lwork, devInfo));

hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));

// free the buffer
hipErrcheck(hipFree(work));
hipErrcheck(hipFree(devInfo));
if(fail_info != nullptr) *fail_info = info_gpu;


//std::vector<double> hcc(nstart * nstart, 0.0);
//std::vector<double> scc(nstart * nstart, 0.0);
//std::vector<double> vcc(nstart * nstart, 0.0);
//std::vector<double> eigenvalue(nstart, 0);
//hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
//hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
//base_device::DEVICE_CPU* cpu_ctx = {};
//dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
// nstart,
// ldh,
// hcc.data(),
// scc.data(),
// eigenvalue.data(),
// vcc.data(),
// fail_info);
//hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
//hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
}
#endif // __LCAO

Expand All @@ -54,23 +107,64 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
std::complex<float>* _vcc,
int* fail_info)
{
std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
std::vector<float> eigenvalue(nstart, 0);
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
base_device::DEVICE_CPU* cpu_ctx = {};
dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
nstart,
ldh,
hcc.data(),
scc.data(),
eigenvalue.data(),
vcc.data(),
fail_info);
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
assert(nstart == ldh);

hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
// now vcc contains hcc

// prepare some values for hipsolverDnZhegvd_bufferSize
int * devInfo = nullptr;
int lwork = 0, info_gpu = 0;
float2 * work = nullptr;
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;

// calculate the sizes needed for pre-allocated buffer.
hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
reinterpret_cast<const float2 *>(_vcc), ldh,
reinterpret_cast<const float2 *>(_scc), ldh,
_eigenvalue,
&lwork));

// allocate memery
hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));

// compute eigenvalues and eigenvectors.
hipsolverErrcheck(hipsolverDnChegvd(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
reinterpret_cast<float2 *>(_vcc), ldh,
const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
_eigenvalue,
work, lwork, devInfo));

hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
// free the buffer
hipErrcheck(hipFree(work));
hipErrcheck(hipFree(devInfo));
if(fail_info != nullptr) *fail_info = info_gpu;


//std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
//std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
//std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
//std::vector<float> eigenvalue(nstart, 0);
//hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
//hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
//base_device::DEVICE_CPU* cpu_ctx = {};
//dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
// nstart,
// ldh,
// hcc.data(),
// scc.data(),
// eigenvalue.data(),
// vcc.data(),
// fail_info);
//hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
//hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
}

template <>
Expand All @@ -83,7 +177,58 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
std::complex<double>* _vcc,
int* fail_info)
{
std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
assert(nstart == ldh);

// save a copy of scc in case the diagonalization fails
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));

hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));

// now vcc contains hcc

// prepare some values for hipsolverDnZhegvd_bufferSize
int * devInfo = nullptr;
int lwork = 0, info_gpu = 0;
double2 * work = nullptr;
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;

// calculate the sizes needed for pre-allocated buffer.
hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
reinterpret_cast<const double2 *>(_vcc), ldh,
reinterpret_cast<const double2 *>(_scc), ldh,
_eigenvalue,
&lwork));

// allocate memery
hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));

// compute eigenvalues and eigenvectors.
hipsolverErrcheck(hipsolverDnZhegvd(
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
nstart,
reinterpret_cast<double2 *>(_vcc), ldh,
const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
_eigenvalue,
work, lwork, devInfo));

hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
// free the buffer
hipErrcheck(hipFree(work));
hipErrcheck(hipFree(devInfo));
if(fail_info != nullptr) *fail_info = info_gpu;







/*std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
std::vector<double> eigenvalue(nstart, 0);
Expand All @@ -99,7 +244,7 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
vcc.data(),
fail_info);
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));*/
}

#ifdef __LCAO
Expand Down

0 comments on commit ed32cb1

Please sign in to comment.