diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml new file mode 100644 index 0000000000..8e065ad484 --- /dev/null +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -0,0 +1,40 @@ +--- +features: + - | + This PR adds the ability to run matrix product state (MPS) simulation on Nvidia GPUs. + More precisely, it offloads the singular value decomposition (SVD) required for + MPS simulation to Nvidia GPUs with the help of cuQuantum. + + Users select the matrix product state simulation method as usual and can now + additionally set the device to GPU. + + Example: + + .. code-block:: python + + from qiskit_aer import AerSimulator + from qiskit.circuit import QuantumCircuit + from qiskit.compiler import transpile + + num_qubits = 10 + shots = 5 + + qc = QuantumCircuit(num_qubits) + qc.h(0) + + for control, target in zip(range(num_qubits-1), range(1, num_qubits)): + qc.cx(control, target) + + qc.measure_all() + + sim = AerSimulator(method="matrix_product_state", device="GPU") + qc_t = transpile(qc, backend=sim) + job = sim.run(qc_t, shots=shots) + + counts = job.result().get_counts() + print(counts) + + + + + diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index d4b540d6a1..ae4f3bddd5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -360,6 +360,19 @@ void State::set_config(const Config &config) { // Set LAPACK SVD MPS::set_mps_lapack_svd(config.mps_lapack); + + // Set device for SVD + MPS::set_mps_svd_device(config.device); + + // Get CUDA device, if GPU offloading enabled + if (config.device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA + } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 424637c4f0..c48a4d4c24 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -19,6 +19,7 @@ #include "stdlib.h" #include "string.h" #include +#include #include #include "framework/linalg/almost_equal.hpp" @@ -45,6 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +#ifdef AER_THRUST_CUDA +std::string MPS::mps_svd_device_; +#endif // AER_THRUST_CUDA //------------------------------------------------------------------------ // local function declarations @@ -663,8 +667,14 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; +#ifdef AER_THRUST_CUDA + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, cuda_stream, cutensor_handle); +#else double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1803,7 +1813,18 @@ void 
MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); + +#ifdef AER_THRUST_CUDA + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, + cutensor_handle); + } else { + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + } +#else csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA + reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 1180e6cddf..599604fa8b 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -15,12 +15,12 @@ #ifndef _aer_matrix_product_state_hpp_ #define _aer_matrix_product_state_hpp_ -#include - #include "framework/json.hpp" #include "framework/operations.hpp" #include "framework/utils.hpp" #include "matrix_product_state_tensor.hpp" +#include +#include namespace AER { namespace MatrixProductState { @@ -81,7 +81,14 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: - MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} + MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { +#ifdef AER_THRUST_CUDA + if (mps_svd_device_.compare("GPU") == 0) { + cudaStreamCreate(&cuda_stream); + cutensornetCreate(&cutensor_handle); + } +#endif // AER_THRUST_CUDA + } ~MPS() {} //-------------------------------------------------------------------------- @@ -321,6 +328,9 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } + static void set_mps_svd_device(std::string mps_svd_device) { + mps_svd_device_ = mps_svd_device; + } static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -544,6 +554,11 @@ class MPS { std::vector q_reg_; std::vector lambda_reg_; +#ifdef AER_THRUST_CUDA + cudaStream_t cuda_stream; + cutensornetHandle_t cutensor_handle; +#endif // AER_THRUST_CUDA + struct ordering { // order_ stores the current ordering of the qubits, // location_ stores the location of each qubit in the vector. 
It is derived @@ -570,6 +585,7 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; + static std::string mps_svd_device_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index cc090deb64..2886333b34 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -157,12 +157,22 @@ class MPS_Tensor { static MPS_Tensor contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, bool mul_by_lambda); +#ifdef AER_THRUST_CUDA + static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle); +#else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack); +#endif // AER_THRUST_CUDA + static void reshape_for_3_qubits_before_SVD(const std::vector &data, MPS_Tensor &reshaped_tensor); + static void contract_2_dimensions(const MPS_Tensor &left_gamma, const MPS_Tensor &right_gamma, uint_t omp_threads, cmatrix_t &result); @@ -591,15 +601,33 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, // rvector_t &lambda - tensors for the result. // Returns: none. //--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle) +#else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack) { + bool mps_lapack) +#endif // AER_THRUST_CUDA +{ cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); +#ifdef AER_THRUST_CUDA + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); + } else { + csvd_wrapper(C, U, S, V, mps_lapack); + } +#else csvd_wrapper(C, U, S, V, mps_lapack); +#endif // AER_THRUST_CUDA + double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index aa0d318423..38c74b16a0 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -28,6 +28,7 @@ #include namespace AER { + // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -667,4 +668,160 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } +#ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { + + bool transposed = false; + + const int64_t rows = A.GetRows(), cols = A.GetColumns(); + + if (rows < cols) { + transposed = true; + A = AER::Utils::dagger(A); + } + cmatrix_t A_cpy = A; + + const int64_t min_dim = std::min(rows, cols); + const int64_t lda = std::max(rows, cols); + + U.resize(lda, min_dim); + 
V.resize(min_dim, min_dim); + S.resize(min_dim); + + size_t sizeA = A.size() * sizeof(complex_t); + size_t sizeU = U.size() * sizeof(complex_t); + size_t sizeS = S.size() * sizeof(double); + size_t sizeV = V.size() * sizeof(complex_t); + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), + *cutensor_V = V.move_to_buffer(); + + cudaDataType_t typeData = CUDA_C_64F; + + std::vector modesA{'m', 'n'}; + std::vector modesU{'m', 'x'}; + std::vector modesV{'x', 'n'}; + + double *cutensor_S = (double *)malloc(sizeS); + + void *D_A; + void *D_U; + void *D_S; + void *D_V; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); + + HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + + cutensornetTensorDescriptor_t descTensorA; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesA = modesA.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, min_dim}; // shape of U + std::vector extentV{min_dim, min_dim}; // shape of V + + const int64_t *strides = + NULL; // the matrices store their entries in column-major order. + + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesA, extentA.data(), strides, modesA.data(), typeData, + &descTensorA)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); + + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); + HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( + handle, descTensorA, descTensorU, descTensorV, NULL, workDesc)); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize)); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize)); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); + + // Request an exact SVD. 
+ HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, + D_S, descTensorV, D_V, NULL, NULL, workDesc, + stream)); + + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); + + V = AER::Utils::dagger(V); + validate_SVD_result(A_cpy, U, S, V); + if (transposed) { + std::swap(U, V); + } + + /*************** + * Free resources + ****************/ + + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + + if (cutensor_S) + free(cutensor_S); + if (D_A) + cudaFree(D_A); + if (D_U) + cudaFree(D_U); + if (D_S) + cudaFree(D_S); + if (D_V) + cudaFree(D_V); + if (devWork) + cudaFree(devWork); + if (hostWork) + free(hostWork); +} +#endif // AER_THRUST_CUDA + } // namespace AER diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index d08ba73424..69e2518462 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -55,6 +55,41 @@ void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); + +#endif // AER_THRUST_CUDA + //------------------------------------------------------------------------- } // end namespace AER //------------------------------------------------------------------------- diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index e04ec8dcbc..17b78f6344 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -82,7 +82,13 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"] + gpu_methods = [ + "statevector", + "density_matrix", + "unitary", + "tensor_network", + "matrix_product_state", + ] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: @@ -102,7 +108,11 @@ def _method_device(methods): # add test cases for cuStateVec if available using special device = 
'GPU_cuStateVec' #'GPU_cuStateVec' is used only inside tests not available in Aer # and this is converted to "device='GPU'" and option "cuStateVec_enalbe = True" is added - if cuStateVec and "tensor_network" != method: + if ( + cuStateVec + and "tensor_network" != method + and "matrix_product_state" != method + ): data_args.append((method, "GPU_cuStateVec")) else: data_args.append((method, "CPU"))