diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml new file mode 100644 index 0000000000..8e065ad484 --- /dev/null +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -0,0 +1,40 @@ +--- +features: + - | + This PR adds the ability to run matrix product state (MPS) simulation on Nvidia GPUs. + More precisely, it offloads the singular value decomposition (SVD) required for + MPS simulation to Nvidia GPUs with the help of cuQuantum. + + Users select the matrix product state simulation method as usual and can now + additionally set the device to GPU. + + Example: + + .. code-block:: python + + from qiskit_aer import AerSimulator + from qiskit.circuit import QuantumCircuit + from qiskit.compiler import transpile + + num_qubits = 10 + shots = 5 + + qc = QuantumCircuit(num_qubits) + qc.h(0) + + for control, target in zip(range(num_qubits-1), range(1, num_qubits)): + qc.cx(control, target) + + qc.measure_all() + + sim = AerSimulator(method="matrix_product_state", device="GPU") + qc_t = transpile(qc, backend=sim) + job = sim.run(qc_t, shots=shots) + + counts = job.result().get_counts() + print(counts) + + + + + diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index d4b540d6a1..ae4f3bddd5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -360,6 +360,19 @@ void State::set_config(const Config &config) { // Set LAPACK SVD MPS::set_mps_lapack_svd(config.mps_lapack); + + // Set device for SVD + MPS::set_mps_svd_device(config.device); + + // Get CUDA device, if GPU offloading enabled + if (config.device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA + } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 424637c4f0..c48a4d4c24 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -19,6 +19,7 @@ #include "stdlib.h" #include "string.h" #include +#include #include #include "framework/linalg/almost_equal.hpp" @@ -45,6 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +#ifdef AER_THRUST_CUDA +std::string MPS::mps_svd_device_; +#endif // AER_THRUST_CUDA //------------------------------------------------------------------------ // local function declarations @@ -663,8 +667,14 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; +#ifdef AER_THRUST_CUDA + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, cuda_stream, cutensor_handle); +#else double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1803,7 +1813,18 @@ void 
MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); + +#ifdef AER_THRUST_CUDA + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, + cutensor_handle); + } else { + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + } +#else csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA + reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 1180e6cddf..599604fa8b 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -15,12 +15,12 @@ #ifndef _aer_matrix_product_state_hpp_ #define _aer_matrix_product_state_hpp_ -#include - #include "framework/json.hpp" #include "framework/operations.hpp" #include "framework/utils.hpp" #include "matrix_product_state_tensor.hpp" +#include +#include namespace AER { namespace MatrixProductState { @@ -81,7 +81,14 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: - MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} + MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { +#ifdef AER_THRUST_CUDA + if (mps_svd_device_.compare("GPU") == 0) { + cudaStreamCreate(&cuda_stream); + cutensornetCreate(&cutensor_handle); + } +#endif // AER_THRUST_CUDA + } ~MPS() {} //-------------------------------------------------------------------------- @@ -321,6 +328,9 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } + static void set_mps_svd_device(std::string mps_svd_device) { + mps_svd_device_ = mps_svd_device; + } static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -544,6 +554,11 @@ class MPS { std::vector q_reg_; std::vector lambda_reg_; +#ifdef AER_THRUST_CUDA + cudaStream_t cuda_stream; + cutensornetHandle_t cutensor_handle; +#endif // AER_THRUST_CUDA + struct ordering { // order_ stores the current ordering of the qubits, // location_ stores the location of each qubit in the vector. 
It is derived @@ -570,6 +585,7 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; + static std::string mps_svd_device_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index cc090deb64..2886333b34 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -157,12 +157,22 @@ class MPS_Tensor { static MPS_Tensor contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, bool mul_by_lambda); +#ifdef AER_THRUST_CUDA + static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle); +#else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack); +#endif // AER_THRUST_CUDA + static void reshape_for_3_qubits_before_SVD(const std::vector &data, MPS_Tensor &reshaped_tensor); + static void contract_2_dimensions(const MPS_Tensor &left_gamma, const MPS_Tensor &right_gamma, uint_t omp_threads, cmatrix_t &result); @@ -591,15 +601,33 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, // rvector_t &lambda - tensors for the result. // Returns: none. //--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle) +#else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack) { + bool mps_lapack) +#endif // AER_THRUST_CUDA +{ cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); +#ifdef AER_THRUST_CUDA + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); + } else { + csvd_wrapper(C, U, S, V, mps_lapack); + } +#else csvd_wrapper(C, U, S, V, mps_lapack); +#endif // AER_THRUST_CUDA + double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index aa0d318423..38c74b16a0 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -28,6 +28,7 @@ #include namespace AER { + // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -667,4 +668,160 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } +#ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { + + bool transposed = false; + + const int64_t rows = A.GetRows(), cols = A.GetColumns(); + + if (rows < cols) { + transposed = true; + A = AER::Utils::dagger(A); + } + cmatrix_t A_cpy = A; + + const int64_t min_dim = std::min(rows, cols); + const int64_t lda = std::max(rows, cols); + + U.resize(lda, min_dim); + 
V.resize(min_dim, min_dim); + S.resize(min_dim); + + size_t sizeA = A.size() * sizeof(complex_t); + size_t sizeU = U.size() * sizeof(complex_t); + size_t sizeS = S.size() * sizeof(double); + size_t sizeV = V.size() * sizeof(complex_t); + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), + *cutensor_V = V.move_to_buffer(); + + cudaDataType_t typeData = CUDA_C_64F; + + std::vector modesA{'m', 'n'}; + std::vector modesU{'m', 'x'}; + std::vector modesV{'x', 'n'}; + + double *cutensor_S = (double *)malloc(sizeS); + + void *D_A; + void *D_U; + void *D_S; + void *D_V; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); + + HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + + cutensornetTensorDescriptor_t descTensorA; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesA = modesA.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, min_dim}; // shape of U + std::vector extentV{min_dim, min_dim}; // shape of V + + const int64_t *strides = + NULL; // the matrices store their entries in column-major order. + + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesA, extentA.data(), strides, modesA.data(), typeData, + &descTensorA)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); + + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); + HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( + handle, descTensorA, descTensorU, descTensorV, NULL, workDesc)); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize)); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize)); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); + + // Request an exact SVD. 
+ HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, + D_S, descTensorV, D_V, NULL, NULL, workDesc, + stream)); + + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); + + V = AER::Utils::dagger(V); + validate_SVD_result(A_cpy, U, S, V); + if (transposed) { + std::swap(U, V); + } + + /*************** + * Free resources + ****************/ + + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + + if (cutensor_S) + free(cutensor_S); + if (D_A) + cudaFree(D_A); + if (D_U) + cudaFree(D_U); + if (D_S) + cudaFree(D_S); + if (D_V) + cudaFree(D_V); + if (devWork) + cudaFree(devWork); + if (hostWork) + free(hostWork); +} +#endif // AER_THRUST_CUDA + } // namespace AER diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index d08ba73424..69e2518462 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -55,6 +55,41 @@ void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); + +#endif // AER_THRUST_CUDA + //------------------------------------------------------------------------- } // end namespace AER //------------------------------------------------------------------------- diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index e04ec8dcbc..17b78f6344 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -82,7 +82,13 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"] + gpu_methods = [ + "statevector", + "density_matrix", + "unitary", + "tensor_network", + "matrix_product_state", + ] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: @@ -102,7 +108,11 @@ def _method_device(methods): # add test cases for cuStateVec if available using special device = 
'GPU_cuStateVec' #'GPU_cuStateVec' is used only inside tests not available in Aer # and this is converted to "device='GPU'" and option "cuStateVec_enalbe = True" is added - if cuStateVec and "tensor_network" != method: + if ( + cuStateVec + and "tensor_network" != method + and "matrix_product_state" != method + ): data_args.append((method, "GPU_cuStateVec")) else: data_args.append((method, "CPU"))