Skip to content

Commit

Permalink
Add native setStateVector support to lightning.gpu (#930)
Browse files Browse the repository at this point in the history
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [x] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**
 [SC-74668]
`setStateVector` via state and wires is supported in the C++ layer in
`lightning.gpu`

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
  • Loading branch information
multiphaseCFD and ringo-but-quantum authored Oct 8, 2024
1 parent 721a453 commit c6f3f3f
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 214 deletions.
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@

### Improvements

* Add `setStateVector(state, wire)` support to the `lightning.gpu` C++ layer.
[(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930)

* Add zero-state initialization to both `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors to remove the `reset_state` in the python layer ctor and refactor `setBasisState(state, wires)` in the C++ layer.
[(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)

Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.39.0-dev39"
__version__ = "0.39.0-dev40"
Original file line number Diff line number Diff line change
Expand Up @@ -289,61 +289,48 @@ class StateVectorCudaMPI final
BaseType::getDataBuffer().zeroInit();
setBasisState_(value, index, use_async);
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
* @brief Set values for a batch of elements of the state-vector.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
* @param state_ptr Pointer to initial state data.
* @param num_states Length of initial state data.
* @param wires Wires.
* @param use_async Use an asynchronous memory copy. Default is false.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

std::vector<index_type> indices_local;
std::vector<std::complex<Precision>> values_local;

for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
i++) {
int index = indices[i];
PL_ASSERT(index >= 0);
std::size_t rankId =
static_cast<std::size_t>(index) >> BaseType::getNumQubits();

if (rankId == mpi_manager_.getRank()) {
int local_index = static_cast<int>(
compute_local_index(static_cast<std::size_t>(index),
this->getNumLocalQubits()));
indices_local.push_back(local_index);
values_local.push_back(values[i]);
void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
const std::vector<std::size_t> &wires,
bool use_async = false) {
PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
"Inconsistent state and wires dimensions.");

const auto num_qubits = this->getTotalNumQubits();

PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
[&num_qubits](const auto i) {
return i >= num_qubits;
}) == wires.end(),
"Invalid wire index.");

using index_type =
typename std::conditional<std::is_same<PrecisionT, float>::value,
int32_t, int64_t>::type;

// Calculate the indices of the state-vector to be set.
// TODO: Could move to GPU/MPI calculation if the state size is large.
std::vector<index_type> indices(num_states);
const std::size_t num_wires = wires.size();
constexpr std::size_t one{1U};
for (std::size_t i = 0; i < num_states; i++) {
std::size_t index{0U};
for (std::size_t j = 0; j < num_wires; j++) {
const std::size_t bit = (i & (one << j)) >> j;
index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
}
indices[i] = static_cast<index_type>(index);
}

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = indices_local.size();

DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};

DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
async);
d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
setStateVector_<index_type>(num_states, state_ptr, indices.data(),
use_async);
mpi_manager_.Barrier();
}

Expand Down Expand Up @@ -1548,6 +1535,62 @@ class StateVectorCudaMPI final
return t_indices;
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector_(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

std::vector<index_type> indices_local;
std::vector<std::complex<Precision>> values_local;

for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
i++) {
int index = indices[i];
PL_ASSERT(index >= 0);
std::size_t rankId =
static_cast<std::size_t>(index) >> BaseType::getNumQubits();

if (rankId == mpi_manager_.getRank()) {
int local_index = static_cast<int>(
compute_local_index(static_cast<std::size_t>(index),
this->getNumLocalQubits()));
indices_local.push_back(local_index);
values_local.push_back(values[i]);
}
}

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = indices_local.size();

DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};

DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
async);
d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
}

/**
* @brief Set value for a single element of the state-vector on device. This
* method is implemented by cudaMemcpy.
Expand Down Expand Up @@ -1637,8 +1680,8 @@ class StateVectorCudaMPI final
}

/**
* @brief Apply parametric Pauli gates to local statevector using custateVec
* calls.
* @brief Apply parametric Pauli gates to local statevector using
* custateVec calls.
*
* @param pauli_words List of Pauli words representing operation.
* @param ctrls Control wires
Expand Down Expand Up @@ -1708,7 +1751,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down Expand Up @@ -1868,7 +1912,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down Expand Up @@ -2009,7 +2054,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,36 +215,46 @@ class StateVectorCudaManaged
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
* @brief Set values for a batch of elements of the state-vector.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
* @param state_ptr Pointer to the initial state data.
* @param num_states Length of the initial state data.
* @param wires Wires.
* @param use_async Use an asynchronous memory copy. Default is false.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = num_indices;
DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};
DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};
void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
const std::vector<std::size_t> &wires,
bool use_async = false) {
PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
"Inconsistent state and wires dimensions.");

d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
const auto num_qubits = BaseType::getNumQubits();

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
[&num_qubits](const auto i) {
return i >= num_qubits;
}) == wires.end(),
"Invalid wire index.");

using index_type =
typename std::conditional<std::is_same<PrecisionT, float>::value,
int32_t, int64_t>::type;

// Calculate the indices of the state-vector to be set.
// TODO: Could move to GPU calculation if the state size is large.
std::vector<index_type> indices(num_states);
const std::size_t num_wires = wires.size();
constexpr std::size_t one{1U};
for (std::size_t i = 0; i < num_states; i++) {
std::size_t index{0U};
for (std::size_t j = 0; j < num_wires; j++) {
const std::size_t bit = (i & (one << j)) >> j;
index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
}
indices[i] = static_cast<index_type>(index);
}
setStateVector_<index_type>(num_states, state_ptr, indices.data(),
use_async);
}

/**
Expand Down Expand Up @@ -1346,9 +1356,8 @@ class StateVectorCudaManaged
return t_indices;
}

/**
* @brief Set value for a single element of the state-vector on device. This
* method is implemented by cudaMemcpy.
/** @brief Set value for a single element of the state-vector on device.
* This method is implemented by cudaMemcpy.
*
* @param value Value to be set for the target element.
* @param index Index of the target element.
Expand All @@ -1362,6 +1371,40 @@ class StateVectorCudaManaged
stream_id);
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector_(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = num_indices;
DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};
DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
d_values.CopyHostDataToGpu(values, d_values.getLength(), async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
}

/**
* @brief Apply parametric Pauli gates using custateVec calls.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
using ParamT = PrecisionT; // Parameter's data precision
using np_arr_c = py::array_t<std::complex<ParamT>,
py::array::c_style | py::array::forcecast>;
using np_arr_sparse_ind = typename std::conditional<
std::is_same<ParamT, float>::value,
py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;

registerGatesForStateVector<StateVectorT>(pyclass);

Expand All @@ -91,20 +87,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
"Set the state vector to a basis state on GPU.")
.def(
"setStateVector",
[](StateVectorT &sv, const np_arr_sparse_ind &indices,
const np_arr_c &state, const bool use_async) {
using index_type = typename std::conditional<
std::is_same<ParamT, float>::value, int32_t, int64_t>::type;

sv.template setStateVector<index_type>(
static_cast<index_type>(indices.request().size),
static_cast<std::complex<PrecisionT> *>(
state.request().ptr),
static_cast<index_type *>(indices.request().ptr),
use_async);
[](StateVectorT &sv, const np_arr_c &state,
const std::vector<std::size_t> &wires, const bool async) {
const auto state_buffer = state.request();
const auto state_ptr =
static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
sv.setStateVector(state_ptr, state_buffer.size, wires, async);
},
"Set State Vector on GPU with values and their corresponding "
"indices for the state vector on device")
"Set State Vector on GPU with values for the state vector and "
"wires on the host memory.")
.def(
"DeviceToDevice",
[](StateVectorT &sv, const StateVectorT &other, bool async) {
Expand Down
Loading

0 comments on commit c6f3f3f

Please sign in to comment.