From acd216d040c0d9ec1161c82331820841cb13386f Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 10 Mar 2021 19:17:46 +0900 Subject: [PATCH] Fixes of multi-chunk State implementation (#1149) Co-authored-by: Victor Villar Co-authored-by: Christopher J. Wood --- CONTRIBUTING.md | 3 + src/controllers/controller.hpp | 55 +++ src/controllers/qasm_controller.hpp | 94 ++-- src/controllers/statevector_controller.hpp | 38 +- src/controllers/unitary_controller.hpp | 36 +- .../density_matrix/densitymatrix.hpp | 27 ++ .../density_matrix/densitymatrix_state.hpp | 4 +- .../densitymatrix_state_chunk.hpp | 425 ++++++++++++------ .../density_matrix/densitymatrix_thrust.hpp | 63 +++ src/simulators/state.hpp | 2 +- src/simulators/state_chunk.hpp | 102 +++-- src/simulators/statevector/chunk/chunk.hpp | 2 + .../statevector/chunk/chunk_container.hpp | 3 - .../chunk/device_chunk_container.hpp | 5 +- .../chunk/host_chunk_container.hpp | 3 + .../statevector/qubitvector_thrust.hpp | 10 +- .../statevector/statevector_state.hpp | 4 +- .../statevector/statevector_state_chunk.hpp | 236 +++++++++- src/simulators/unitary/unitary_state.hpp | 4 +- .../unitary/unitary_state_chunk.hpp | 129 ++++-- src/transpile/cacheblocking.hpp | 2 +- .../backends/qasm_simulator/qasm_chunk.py | 136 ++++++ ...est_qasm_simulator_density_matrix_chunk.py | 74 +++ .../test_qasm_simulator_density_matrix_mpi.py | 84 ---- ... test_qasm_simulator_statevector_chunk.py} | 49 +- 25 files changed, 1113 insertions(+), 477 deletions(-) create mode 100644 test/terra/backends/qasm_simulator/qasm_chunk.py create mode 100644 test/terra/backends/test_qasm_simulator_density_matrix_chunk.py delete mode 100644 test/terra/backends/test_qasm_simulator_density_matrix_mpi.py rename test/terra/backends/{test_qasm_simulator_statevector_mpi.py => test_qasm_simulator_statevector_chunk.py} (56%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a3db828f52..44a59da025 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -681,7 +681,10 @@ This technique allows applying quantum gates to each chunk independently without Before the actual simulation, we apply transpilation to remap the input circuits to the equivalent circuits that has all the quantum gates on the lower qubits than the chunk's number of qubits. And the (noiseless) swap gates are inserted to exchange data. +Please refer to this paper (https://arxiv.org/abs/2102.02957) for more detailed algorithm and implementation of parallel simulation. + So to simulate by using multiple GPUs or multiple nodes on the cluster, following configurations should be set to backend options. +(If there is not enough memory to simulate the input circuit, Qiskit Aer automatically set following options, but it is recommended to explicitly set them) - blocking_enable diff --git a/src/controllers/controller.hpp b/src/controllers/controller.hpp index 8babb798ef..f20e0e3e3f 100755 --- a/src/controllers/controller.hpp +++ b/src/controllers/controller.hpp @@ -51,6 +51,7 @@ #include "noise/noise_model.hpp" #include "transpile/basic_opts.hpp" #include "transpile/truncate_qubits.hpp" +#include "transpile/cacheblocking.hpp" namespace AER { namespace Base { @@ -216,8 +217,19 @@ class Controller { set_distributed_parallelization(const std::vector &circuits, const std::vector &noise); + virtual bool multiple_chunk_required(const Circuit &circuit, + const Noise::NoiseModel &noise) const; + void save_exception_to_results(Result &result,const std::exception &e); + + //setting cache blocking transpiler + Transpile::CacheBlocking transpile_cache_blocking(const Circuit& circ, + const Noise::NoiseModel& noise, + const json_t& config, + const size_t complex_size,bool is_matrix) const; + + // Get system memory size size_t get_system_memory_mb(); size_t get_gpu_memory_mb(); @@ -274,6 +286,8 @@ class Controller { //process information (MPI) int myrank_ = 0; int num_processes_ = 1; + + uint_t cache_block_qubit_ = 0; }; //========================================================================= @@ -348,6 +362,11 @@ void Controller::set_config(const json_t &config) { JSON::get_value(accept_distributed_results_, "accept_distributed_results", config); } + //enable multiple qregs if cache blocking is enabled + cache_block_qubit_ = 0; + if(JSON::check_key("blocking_qubits", config)){ + JSON::get_value(cache_block_qubit_,"blocking_qubits", config); + } } void Controller::clear_config() { @@ -535,6 +554,21 @@ uint_t Controller::get_distributed_num_processes(bool par_shots) const } } +bool Controller::multiple_chunk_required(const Circuit &circ, + const Noise::NoiseModel &noise) const +{ + if(circ.num_qubits < 3) + return false; + + if(num_process_per_experiment_ > 1 || Controller::get_min_memory_mb() < required_memory_mb(circ, noise)) + return true; + + if(cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) + return true; + + return false; +} + size_t Controller::get_system_memory_mb() { size_t total_physical_memory = 0; #if defined(__linux__) || defined(__APPLE__) @@ -654,6 +688,27 @@ void Controller::save_exception_to_results(Result &result,const std::exception & } } +Transpile::CacheBlocking Controller::transpile_cache_blocking(const Circuit& circ, + const Noise::NoiseModel& noise, + const json_t& config, + const size_t complex_size,bool is_matrix) const +{ + Transpile::CacheBlocking cache_block_pass; + + cache_block_pass.set_config(config); + if(!cache_block_pass.enabled()){ + //if blocking is not set by config, automatically set if required + if(multiple_chunk_required(circ,noise)){ + int nplace = num_process_per_experiment_; + if(num_gpus_ > 0) + nplace *= num_gpus_; + cache_block_pass.set_blocking(circ.num_qubits, get_min_memory_mb() << 20, nplace, complex_size,is_matrix); + } + } + + return cache_block_pass; +} + //------------------------------------------------------------------------- // Qobj execution //------------------------------------------------------------------------- diff --git a/src/controllers/qasm_controller.hpp b/src/controllers/qasm_controller.hpp index ba903aa45e..a408b2e83d 100755 --- a/src/controllers/qasm_controller.hpp +++ b/src/controllers/qasm_controller.hpp @@ -215,11 +215,6 @@ class QasmController : public Base::Controller { const Operations::OpSet &opset, const json_t& config) const; - - Transpile::CacheBlocking transpile_cache_blocking(const Circuit& circ, - const Noise::NoiseModel& noise, - const json_t& config) const; - //---------------------------------------------------------------- // Run circuit helpers //---------------------------------------------------------------- @@ -306,9 +301,6 @@ class QasmController : public Base::Controller { // Controller-level parameter for CH method bool extended_stabilizer_measure_sampling_ = false; - - //using multiple chunks - bool multiple_qregs_ = false; }; //========================================================================= @@ -381,11 +373,6 @@ void QasmController::set_config(const json_t& config) { "QasmController: initial_statevector is not a unit vector"); } } - - //enable multiple qregs if cache blocking is enabled - if(JSON::check_key("blocking_enable", config)){ - JSON::get_value(multiple_qregs_,"blocking_enable", config); - } } void QasmController::clear_config() { @@ -407,7 +394,7 @@ void QasmController::run_circuit(const Circuit& circ, // Validate circuit for simulation method switch (simulation_method(circ, noise, true)) { case Method::statevector: { - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper>>( @@ -440,7 +427,7 @@ void QasmController::run_circuit(const Circuit& circ, "QasmController: method statevector_gpu is not supported on this " "system"); #else - if(multiple_qregs_ || (parallel_shots_ > 1 || parallel_experiments_ > 1)){ + if(Base::Controller::multiple_chunk_required(circ,noise) || (parallel_shots_ > 1 || parallel_experiments_ > 1)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper< @@ -478,7 +465,7 @@ void QasmController::run_circuit(const Circuit& circ, "QasmController: method statevector_thrust is not supported on this " "system"); #else - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper< @@ -511,7 +498,7 @@ void QasmController::run_circuit(const Circuit& circ, #endif } case Method::density_matrix: { - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision density matrix simulation return run_circuit_helper< @@ -548,7 +535,7 @@ void QasmController::run_circuit(const Circuit& circ, "QasmController: method density_matrix_gpu is not supported on this " "system"); #else - if(multiple_qregs_ || (parallel_shots_ > 1 || parallel_experiments_ > 1)){ + if(Base::Controller::multiple_chunk_required(circ,noise) || (parallel_shots_ > 1 || parallel_experiments_ > 1)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision density matrix simulation return run_circuit_helper< @@ -586,7 +573,7 @@ void QasmController::run_circuit(const Circuit& circ, "this " "system"); #else - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (simulation_precision_ == Precision::double_precision) { // Double-precision density matrix simulation return run_circuit_helper< @@ -938,42 +925,6 @@ Transpile::Fusion QasmController::transpile_fusion(Method method, return fusion_pass; } -Transpile::CacheBlocking QasmController::transpile_cache_blocking(const Circuit& circ, - const Noise::NoiseModel& noise, - const json_t& config) const -{ - Transpile::CacheBlocking cache_block_pass; - - cache_block_pass.set_config(config); - if(!cache_block_pass.enabled()){ - //if blocking is not set by config, automatically set if required - if(Base::Controller::num_process_per_experiment_ > 1 || Base::Controller::get_min_memory_mb() < required_memory_mb(circ, noise)){ - int nplace = Base::Controller::num_process_per_experiment_; - if(Base::Controller::num_gpus_ > 0) - nplace *= Base::Controller::num_gpus_; - - size_t complex_size = (simulation_precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex); - - switch (simulation_method(circ, noise, false)) { - case Method::statevector: - case Method::statevector_thrust_cpu: - case Method::statevector_thrust_gpu: - cache_block_pass.set_blocking(circ.num_qubits, Base::Controller::get_min_memory_mb() << 20, nplace, complex_size,false); - break; - case Method::density_matrix: - case Method::density_matrix_thrust_cpu: - case Method::density_matrix_thrust_gpu: - cache_block_pass.set_blocking(circ.num_qubits, Base::Controller::get_min_memory_mb() << 20, nplace, complex_size,true); - break; - default: - throw std::runtime_error("QasmController: No enough memory to simulate this method on the sysytem"); - } - } - } - - return cache_block_pass; -} - void QasmController::set_parallelization_circuit( const Circuit& circ, const Noise::NoiseModel& noise_model) { @@ -1148,9 +1099,19 @@ void QasmController::run_circuit_helper(const Circuit& circ, auto fusion_pass = transpile_fusion(method, opt_circ.opset(), config); fusion_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); - auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config); + bool is_matrix = false; + if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu) + is_matrix = true; + auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),is_matrix); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); + uint_t block_bits = 0; + if(cache_block_pass.enabled()) + block_bits = cache_block_pass.block_bits(); + + //allocate qubit register + state.allocate(Base::Controller::max_qubits_,block_bits); + // Run simulation run_multi_shot(opt_circ, shots, state, initial_state, method, result, rng); } @@ -1179,9 +1140,6 @@ void QasmController::run_multi_shot(const Circuit& circ, // Implement measure sampler auto pos = circ.first_measure_pos; // Position of first measurement op - //allocate qubit register - state.allocate(Base::Controller::max_qubits_); - // Run circuit instructions before first measure std::vector ops(circ.ops.begin(), circ.ops.begin() + pos); @@ -1197,9 +1155,6 @@ void QasmController::run_multi_shot(const Circuit& circ, // Add measure sampling metadata result.metadata.add(true, "measure_sampling"); } else { - //allocate qubit register - state.allocate(Base::Controller::max_qubits_); - // Perform standard execution if we cannot apply the // measurement sampling optimization while (shots-- > 0) { @@ -1225,10 +1180,10 @@ void QasmController::run_circuit_with_sampled_noise(const Circuit& circ, measure_pass.set_config(config); Noise::NoiseModel dummy_noise; - auto cache_block_pass = transpile_cache_blocking(circ,noise,config); - - //allocate qubit register - state.allocate(Base::Controller::max_qubits_); + bool is_matrix = false; + if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu) + is_matrix = true; + auto cache_block_pass = transpile_cache_blocking(circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),is_matrix); // Sample noise using circuit method while (shots-- > 0) { @@ -1238,6 +1193,13 @@ void QasmController::run_circuit_with_sampled_noise(const Circuit& circ, fusion_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), result); cache_block_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), result); + uint_t block_bits = 0; + if(cache_block_pass.enabled()) + block_bits = cache_block_pass.block_bits(); + + //allocate qubit register + state.allocate(Base::Controller::max_qubits_,block_bits); + run_single_shot(noise_circ, state, initial_state, result, rng); } } diff --git a/src/controllers/statevector_controller.hpp b/src/controllers/statevector_controller.hpp index b851632c31..db5c9a9cfe 100755 --- a/src/controllers/statevector_controller.hpp +++ b/src/controllers/statevector_controller.hpp @@ -124,9 +124,6 @@ class StatevectorController : public Base::Controller { // Precision of statevector Precision precision_ = Precision::double_precision; - //using multiple chunks - bool multiple_qregs_ = false; - }; //========================================================================= @@ -182,11 +179,6 @@ void StatevectorController::set_config(const json_t& config) { precision_ = Precision::single_precision; } } - - //enable multiple qregs if cache blocking is enabled - if(JSON::check_key("blocking_enable", config)){ - JSON::get_value(multiple_qregs_,"blocking_enable", config); - } } void StatevectorController::clear_config() { @@ -215,7 +207,7 @@ void StatevectorController::run_circuit( switch (method_) { case Method::automatic: case Method::statevector_cpu: { - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper>>( @@ -240,7 +232,7 @@ void StatevectorController::run_circuit( } case Method::statevector_thrust_gpu: { #ifdef AER_THRUST_CUDA - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper< @@ -275,7 +267,7 @@ void StatevectorController::run_circuit( } case Method::statevector_thrust_cpu: { #ifdef AER_THRUST_CPU - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision Statevector simulation return run_circuit_helper< @@ -353,34 +345,32 @@ void StatevectorController::run_circuit_helper( result.set_config(config); // Optimize circuit - const std::vector* op_ptr = &circ.ops; Transpile::Fusion fusion_pass; - Transpile::CacheBlocking cache_block_pass; - fusion_pass.set_config(config); - cache_block_pass.set_config(config); - fusion_pass.set_parallelization(parallel_state_update_); - Circuit opt_circ; + Circuit opt_circ = circ; // copy circuit + Noise::NoiseModel dummy_noise; // dummy object for transpile pass if (fusion_pass.active && circ.num_qubits >= fusion_pass.threshold) { - opt_circ = circ; // copy circuit - Noise::NoiseModel dummy_noise; // dummy object for transpile pass fusion_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); - cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); - op_ptr = &opt_circ.ops; } - // Run single shot collecting measure data or snapshots - state.allocate(Base::Controller::max_qubits_); + Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),false); + cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); + uint_t block_bits = 0; + if(cache_block_pass.enabled()) + block_bits = cache_block_pass.block_bits(); + state.allocate(Base::Controller::max_qubits_,block_bits); + + // Run single shot collecting measure data or snapshots if (initial_state_.empty()) { state.initialize_qreg(circ.num_qubits); } else { state.initialize_qreg(circ.num_qubits, initial_state_); } state.initialize_creg(circ.num_memory, circ.num_registers); - state.apply_ops(*op_ptr, result, rng); + state.apply_ops(opt_circ.ops, result, rng); Base::Controller::save_count_data(result, state.creg()); // Add final state to the data diff --git a/src/controllers/unitary_controller.hpp b/src/controllers/unitary_controller.hpp index 935ca69dc6..f54f52d5b2 100755 --- a/src/controllers/unitary_controller.hpp +++ b/src/controllers/unitary_controller.hpp @@ -113,10 +113,6 @@ class UnitaryController : public Base::Controller { // Precision of a unitary matrix Precision precision_ = Precision::double_precision; - - //using multiple chunks - bool multiple_qregs_ = false; - }; //========================================================================= @@ -172,11 +168,6 @@ void UnitaryController::set_config(const json_t &config) { precision_ = Precision::single_precision; } } - - //enable multiple qregs if cache blocking is enabled - if(JSON::check_key("blocking_enable", config)){ - JSON::get_value(multiple_qregs_,"blocking_enable", config); - } } void UnitaryController::clear_config() { @@ -207,7 +198,7 @@ void UnitaryController::run_circuit(const Circuit &circ, switch (method_) { case Method::automatic: case Method::unitary_cpu: { - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision unitary simulation return run_circuit_helper< @@ -236,7 +227,7 @@ void UnitaryController::run_circuit(const Circuit &circ, } case Method::unitary_thrust_gpu: { #ifdef AER_THRUST_CUDA - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision unitary simulation return run_circuit_helper< @@ -270,7 +261,7 @@ void UnitaryController::run_circuit(const Circuit &circ, } case Method::unitary_thrust_cpu: { #ifdef AER_THRUST_CPU - if(multiple_qregs_){ + if(Base::Controller::multiple_chunk_required(circ,noise)){ if (precision_ == Precision::double_precision) { // Double-precision unitary simulation return run_circuit_helper< @@ -354,25 +345,26 @@ void UnitaryController::run_circuit_helper( result.metadata.add(state.name(), "method"); // Optimize circuit - const std::vector* op_ptr = &circ.ops; Transpile::Fusion fusion_pass; - Transpile::CacheBlocking cache_block_pass; fusion_pass.threshold /= 2; // Halve default threshold for unitary simulator fusion_pass.set_config(config); - cache_block_pass.set_config(config); fusion_pass.set_parallelization(parallel_state_update_); - Circuit opt_circ; + Circuit opt_circ = circ; // copy circuit + Noise::NoiseModel dummy_noise; // dummy object for transpile pass if (fusion_pass.active && circ.num_qubits >= fusion_pass.threshold) { - opt_circ = circ; // copy circuit - Noise::NoiseModel dummy_noise; // dummy object for transpile pass fusion_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); - cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); - op_ptr = &opt_circ.ops; } + Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),true); + cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); + + uint_t block_bits = 0; + if(cache_block_pass.enabled()) + block_bits = cache_block_pass.block_bits(); + state.allocate(Base::Controller::max_qubits_,block_bits); + // Run single shot collecting measure data or snapshots - state.allocate(Base::Controller::max_qubits_); if (initial_unitary_.empty()) { state.initialize_qreg(circ.num_qubits); @@ -380,7 +372,7 @@ void UnitaryController::run_circuit_helper( state.initialize_qreg(circ.num_qubits, initial_unitary_); } state.initialize_creg(circ.num_memory, circ.num_registers); - state.apply_ops(*op_ptr, result, rng); + state.apply_ops(opt_circ.ops, result, rng); Base::Controller::save_count_data(result, state.creg()); // Add final state unitary to the data diff --git a/src/simulators/density_matrix/densitymatrix.hpp b/src/simulators/density_matrix/densitymatrix.hpp index a013296702..2e2b9ad833 100755 --- a/src/simulators/density_matrix/densitymatrix.hpp +++ b/src/simulators/density_matrix/densitymatrix.hpp @@ -131,6 +131,7 @@ class DensityMatrix : public UnitaryMatrix { // Return Pauli expectation value double expval_pauli(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase=1.0) const; + double expval_pauli_non_diagonal_chunk(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase=1.0) const; protected: @@ -400,6 +401,32 @@ double DensityMatrix::expval_pauli(const reg_t &qubits, std::move(lambda), size_t(0), nrows >> 1)); } +template +double DensityMatrix::expval_pauli_non_diagonal_chunk(const reg_t &qubits, + const std::string &pauli,const complex_t initial_phase) const +{ + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = QV::pauli_masks_and_phase(qubits, pauli); + + // Size of density matrix + const size_t nrows = BaseMatrix::rows_; + + auto phase = std::complex(initial_phase); + QV::add_y_phase(num_y, phase); + + auto lambda = [&](const int_t i, double &val_re, double &val_im)->void { + (void)val_im; // unused + auto idx_mat = i ^ x_mask + nrows * i; + auto val = std::real(phase * BaseVector::data_[idx_mat]); + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) { + val = - val; + } + val_re += val; + }; + return std::real(BaseVector::apply_reduction_lambda( + std::move(lambda), size_t(0), nrows)); +} + //----------------------------------------------------------------------- // Z-measurement outcome probabilities //----------------------------------------------------------------------- diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index 19bf2b43f8..25c6b80322 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -129,7 +129,7 @@ class State : public Base::State { virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override; - virtual void allocate(uint_t num_qubits) override; + virtual void allocate(uint_t num_qubits,uint_t block_bits) override; //----------------------------------------------------------------------- // Additional methods @@ -359,7 +359,7 @@ const stringmap_t State::snapshotset_( // Initialization //------------------------------------------------------------------------- template -void State::allocate(uint_t num_qubits) +void State::allocate(uint_t num_qubits,uint_t block_bits) { BaseState::qreg_.chunk_setup(num_qubits*2,num_qubits*2,0,1); } diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index 2a625d7d13..31128fc989 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -27,36 +27,34 @@ #include "densitymatrix_thrust.hpp" #endif -//#include "densitymatrix_state.h" - namespace AER { namespace DensityMatrixChunk { +using OpType = Operations::OpType; + // OpSet of supported instructions const Operations::OpSet StateOpSet( // Op types - {Operations::OpType::gate, Operations::OpType::measure, - Operations::OpType::reset, Operations::OpType::snapshot, - Operations::OpType::barrier, Operations::OpType::bfunc, - Operations::OpType::roerror, Operations::OpType::matrix, - Operations::OpType::diagonal_matrix, Operations::OpType::kraus, - Operations::OpType::superop, Operations::OpType::save_expval, - Operations::OpType::save_expval_var}, + {OpType::gate, OpType::measure, + OpType::reset, OpType::snapshot, + OpType::barrier, OpType::bfunc, + OpType::roerror, OpType::matrix, + OpType::diagonal_matrix, OpType::kraus, + OpType::superop, OpType::save_expval, + OpType::save_expval_var, OpType::save_densmat, + OpType::save_probs, OpType::save_probs_ket, + OpType::save_amps_sq + }, // Gates {"U", "CX", "u1", "u2", "u3", "u", "cx", "cy", "cz", "swap", "id", "x", "y", "z", "h", "s", "sdg", "t", "tdg", "ccx", "r", "rx", "ry", "rz", "rxx", "ryy", "rzz", "rzx", "p", "cp", "cu1", "sx", "x90", "delay", "pauli"}, // Snapshots - {"memory", "register", "probabilities", + {"density_matrix", "memory", "register", "probabilities", "probabilities_with_variance", "expectation_value_pauli", "expectation_value_pauli_with_variance"}); -// Allowed gates enum class -enum class Gates { - u1, u2, u3, r, rx,ry, rz, id, x, y, z, h, s, sdg, sx, t, tdg, - cx, cy, cz, swap, rxx, ryy, rzz, rzx, ccx, cp, pauli -}; //========================================================================= // DensityMatrix State subclass @@ -115,8 +113,9 @@ class State : public Base::StateChunk { void initialize_omp(); auto move_to_matrix(); - + auto copy_to_matrix(); protected: + auto apply_to_matrix(bool copy = false); //----------------------------------------------------------------------- // Apply instructions @@ -170,10 +169,28 @@ class State : public Base::StateChunk { // Save data instructions //----------------------------------------------------------------------- + // Save the current density matrix or reduced density matrix + void apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op = false); + + // Helper function for computing expectation value + void apply_save_probs(const Operations::Op &op, + ExperimentResult &result); + + // Helper function for saving amplitudes squared + void apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result); + // Helper function for computing expectation value virtual double expval_pauli(const reg_t &qubits, const std::string& pauli) override; + // Return the reduced density matrix for the simulator + cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false); + cmatrix_t reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted); + //----------------------------------------------------------------------- // Measurement Helpers //----------------------------------------------------------------------- @@ -230,8 +247,6 @@ class State : public Base::StateChunk { ExperimentResult &result, bool variance); - // Return the reduced density matrix for the simulator - cmatrix_t reduced_density_matrix(const reg_t &qubits, const reg_t& qubits_sorted); //----------------------------------------------------------------------- // Single-qubit gate helpers @@ -276,7 +291,7 @@ void State::initialize_qreg(uint_t num_qubits) if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(i=0;i::initialize_qreg(uint_t num_qubits) #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -309,7 +324,7 @@ void State::initialize_qreg(uint_t num_qubits, int_t iChunk; if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk> (BaseState::num_qubits_/2 - BaseState::chunk_bits_/2); - local_row_offset <<= (BaseState::chunk_bits_/2); - local_col_offset <<= (BaseState::chunk_bits_/2); + uint_t irow_chunk = ((iChunk + BaseState::global_chunk_index_) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk uint_t i,row,col; cvector_t tmp(1ull << BaseState::chunk_bits_); for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t row = i & ((1ull << (BaseState::chunk_bits_/2))-1); - uint_t col = i >> (BaseState::chunk_bits_/2); - tmp[i] = input[local_row_offset + row + ((local_col_offset + col) << (BaseState::num_qubits_/2))]; + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_/2); + BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -350,7 +363,7 @@ void State::initialize_qreg(uint_t num_qubits, int_t iChunk; if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk> (BaseState::num_qubits_/2 - BaseState::chunk_bits_/2); - local_row_offset <<= (BaseState::chunk_bits_/2); - local_col_offset <<= (BaseState::chunk_bits_/2); + uint_t irow_chunk = ((iChunk + BaseState::global_chunk_index_) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk uint_t i,row,col; cvector_t tmp(1ull << BaseState::chunk_bits_); for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t row = i & ((1ull << (BaseState::chunk_bits_/2))-1); - uint_t col = i >> (BaseState::chunk_bits_/2); - tmp[i] = state[local_row_offset + row + ((local_col_offset + col) << (BaseState::num_qubits_/2))]; + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_/2); + BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -391,7 +402,7 @@ void State::initialize_qreg(uint_t num_qubits, int_t iChunk; if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk> (BaseState::num_qubits_/2 - BaseState::chunk_bits_/2); - local_row_offset <<= (BaseState::chunk_bits_/2); - local_col_offset <<= (BaseState::chunk_bits_/2); + uint_t irow_chunk = ((iChunk + BaseState::global_chunk_index_) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk uint_t i,row,col; cvector_t tmp(1ull << BaseState::chunk_bits_); for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t row = i & ((1ull << (BaseState::chunk_bits_/2))-1); - uint_t col = i >> (BaseState::chunk_bits_/2); - tmp[i] = state[local_row_offset + row + ((local_col_offset + col) << (BaseState::num_qubits_/2))]; + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_/2); + BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -437,32 +446,94 @@ auto State::move_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } - else{ - int_t iChunk; - auto state = BaseState::qregs_[0].vector(); + return apply_to_matrix(false); +} + +template +auto State::copy_to_matrix() +{ + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].copy_to_matrix(); + } + return apply_to_matrix(true); +} + +template +auto State::apply_to_matrix(bool copy) +{ + int_t iChunk; + uint_t size = 1ull << (BaseState::chunk_bits_*2); + uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); + auto matrix = BaseState::qregs_[0].copy_to_matrix(); + + if(BaseState::distributed_rank_ == 0){ //TO DO check memory availability - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); + matrix.resize(1ull << (BaseState::num_qubits_),1ull << (BaseState::num_qubits_)); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=1;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t icol = i & mask; + matrix(icol_chunk+icol,irow_chunk+irow) = recv(icol,irow); } } +#endif + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + if(copy){ + auto tmp = BaseState::qregs_[iChunk].copy_to_matrix(); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t icol = i & mask; + matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); + } + } + else{ + auto tmp = BaseState::qregs_[iChunk].move_to_matrix(); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t icol = i & mask; + matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); + } + } + } + } + else{ #ifdef AER_MPI - BaseState::gather_state(state); + //send matrices to process 0 + for(iChunk=0;iChunk::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::superop: BaseState::qregs_[iChunk].apply_superop_matrix(op.qubits, Utils::vectorize_matrix(op.mats[0])); break; - case Operations::OpType::kraus: - apply_kraus(op.qubits, op.mats); - break; case Operations::OpType::save_expval: case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result, final_ops); + break; + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + apply_save_probs(op, result); + break; + case Operations::OpType::save_amps_sq: + apply_save_amplitudes_sq(op, result); + break; default: throw std::invalid_argument("DensityMatrix::State::invalid instruction \'" + op.name + "\'."); @@ -561,26 +639,26 @@ void State::apply_chunk_swap(const reg_t &qubits) uint_t q0,q1; q0 = qubits[0]; q1 = qubits[1]; - if(qubits[0] >= BaseState::chunk_bits_/2){ - q0 += BaseState::chunk_bits_/2; + if(qubits[0] >= BaseState::chunk_bits_){ + q0 += BaseState::chunk_bits_; } - if(qubits[1] >= BaseState::chunk_bits_/2){ - q1 += BaseState::chunk_bits_/2; + if(qubits[1] >= BaseState::chunk_bits_){ + q1 += BaseState::chunk_bits_; } reg_t qs0 = {{q0, q1}}; BaseState::apply_chunk_swap(qs0); - if(qubits[0] >= BaseState::chunk_bits_/2){ - q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_)/2; + if(qubits[0] >= BaseState::chunk_bits_){ + q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); } else{ - q0 += BaseState::chunk_bits_/2; + q0 += BaseState::chunk_bits_; } - if(qubits[1] >= BaseState::chunk_bits_/2){ - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_)/2; + if(qubits[1] >= BaseState::chunk_bits_){ + q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); } else{ - q1 += BaseState::chunk_bits_/2; + q1 += BaseState::chunk_bits_; } reg_t qs1 = {{q0, q1}}; BaseState::apply_chunk_swap(qs1); @@ -590,9 +668,56 @@ void State::apply_chunk_swap(const reg_t &qubits) // Implementation: Save data //========================================================================= -template -double State::expval_pauli(const reg_t &qubits, - const std::string& pauli) +template +void State::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + BaseState::save_data_average(result, op.string_params[0], + Utils::vec2ket(probs, json_chop_threshold_, 16), + op.save_type); + } else { + BaseState::save_data_average(result, op.string_params[0], + std::move(probs), op.save_type); + } +} + +template +void State::apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result) +{ + if (op.int_params.empty()) { + throw std::invalid_argument("Invalid save_amplitudes_sq instructions (empty params)."); + } + const int_t size = op.int_params.size(); + int_t iChunk; + rvector_t amps_sq(size,0); +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow != icol) + continue; + +#pragma omp parallel for if (size > pow(2, omp_qubit_threshold_) && \ + BaseState::threads_ > 1) \ + num_threads(BaseState::threads_) + for (int_t i = 0; i < size; ++i) { + if(op.int_params[i] >= (irow << BaseState::chunk_bits_) && op.int_params[i] < ((irow+1) << BaseState::chunk_bits_)) + amps_sq[i] = BaseState::qregs_[iChunk].probability(op.int_params[i] - (irow << BaseState::chunk_bits_)); + } + } +#ifdef AER_MPI + BaseState::reduce_sum(amps_sq); +#endif + BaseState::save_data_average(result, op.string_params[0], + std::move(amps_sq), op.save_type); +} + +template +double State::expval_pauli(const reg_t &qubits, + const std::string& pauli) { reg_t qubits_in_chunk; reg_t qubits_out_chunk; @@ -604,7 +729,7 @@ double State::expval_pauli(const reg_t &qubits, //get inner/outer chunk pauli string n = pauli.size(); for(i=0;i::expval_pauli(const reg_t &qubits, } } - int_t nrows = 1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2); + int_t nrows = 1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)); if(qubits_out_chunk.size() > 0){ //there are bits out of chunk std::complex phase = 1.0; @@ -625,10 +750,10 @@ double State::expval_pauli(const reg_t &qubits, uint_t x_mask, z_mask, num_y, x_max; std::tie(x_mask, z_mask, num_y, x_max) = AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); - z_mask >>= (BaseState::chunk_bits_/2); + z_mask >>= (BaseState::chunk_bits_); if(x_mask != 0){ - x_mask >>= (BaseState::chunk_bits_/2); - x_max -= (BaseState::chunk_bits_/2); + x_mask >>= (BaseState::chunk_bits_); + x_max -= (BaseState::chunk_bits_); AER::QV::add_y_phase(num_y,phase); @@ -641,10 +766,10 @@ double State::expval_pauli(const reg_t &qubits, uint_t iChunk = (irow ^ x_mask) + irow * nrows; if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(iChunk & z_mask) & 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,phase); + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); } } } @@ -654,9 +779,9 @@ double State::expval_pauli(const reg_t &qubits, uint_t iChunk = i * (nrows+1); if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process double sign = 1.0; - if (z_mask && (AER::Utils::popcount((i + BaseState::global_chunk_index_) & z_mask) & 1)) + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) sign = -1.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk); + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); } } } @@ -666,7 +791,7 @@ double State::expval_pauli(const reg_t &qubits, for(i=0;i iChunk){ //on this process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); } } } @@ -677,6 +802,16 @@ double State::expval_pauli(const reg_t &qubits, return expval; } +template +void State::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op) +{ + BaseState::save_data_average(result, op.string_params[0], + reduced_density_matrix(op.qubits, last_op), + op.save_type); +} + //========================================================================= // Implementation: Snapshots //========================================================================= @@ -717,10 +852,10 @@ void State::apply_snapshot(const Operations::Op &op, snapshot_pauli_expval(op, result, true); } break; /* TODO - case DensityMatrix::Snapshots::expval_matrix: { + case Snapshots::expval_matrix: { snapshot_matrix_expval(op, data, false); } break; - case DensityMatrix::Snapshots::expval_matrix_var: { + case Snapshots::expval_matrix_var: { snapshot_matrix_expval(op, data, true); } break; */ @@ -775,10 +910,19 @@ template void State::snapshot_density_matrix(const Operations::Op &op, ExperimentResult &result, bool last_op) +{ + result.legacy_data.add_average_snapshot("density_matrix", op.string_params[0], + BaseState::creg_.memory_hex(), + reduced_density_matrix(op.qubits, last_op), false); +} + + +template +cmatrix_t State::reduced_density_matrix(const reg_t& qubits, bool last_op) { cmatrix_t reduced_state; // Check if tracing over all qubits - if (op.qubits.empty()) { + if (qubits.empty()) { reduced_state = cmatrix_t(1, 1); std::complex sum = 0.0; @@ -790,30 +934,26 @@ void State::snapshot_density_matrix(const Operations::Op &op, #endif reduced_state[0] = sum; } else { - - auto qubits_sorted = op.qubits; + auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((op.qubits.size() == BaseState::qregs_[0].num_qubits()) && (op.qubits == qubits_sorted)) { + if ((qubits.size() == BaseState::num_qubits_) && (qubits == qubits_sorted)) { if (last_op) { reduced_state = move_to_matrix(); } else { - reduced_state = move_to_matrix(); + reduced_state = copy_to_matrix(); } } else { - reduced_state = reduced_density_matrix(op.qubits, qubits_sorted); + reduced_state = reduced_density_matrix_helper(qubits, qubits_sorted); } } - - result.legacy_data.add_average_snapshot("density_matrix", op.string_params[0], - BaseState::creg_.memory_hex(), - std::move(reduced_state), false); + return reduced_state; } - - + template -cmatrix_t State::reduced_density_matrix(const reg_t& qubits, const reg_t& qubits_sorted) { - +cmatrix_t State::reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted) +{ // Get superoperator qubits const reg_t squbits = BaseState::qregs_[0].superop_qubits(qubits); const reg_t squbits_sorted = BaseState::qregs_[0].superop_qubits(qubits_sorted); @@ -832,12 +972,12 @@ cmatrix_t State::reduced_density_matrix(const reg_t& qubits, const re auto vmat = BaseState::qregs_[0].vector(); //TO DO check memory availability - vmat.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); + vmat.resize(BaseState::num_local_chunks_ << (BaseState::chunk_bits_*2)); #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=1;iChunk::apply_gate(const uint_t iChunk, const Operations::Op &op) case DensityMatrix::Gates::rzx: BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); break; + case DensityMatrix::Gates::pauli: + apply_pauli(op.qubits, op.string_params[0]); + break; default: // We shouldn't reach here unless there is a bug in gateset throw std::invalid_argument("DensityMatrix::State::invalid gate instruction \'" + @@ -1049,7 +1192,7 @@ rvector_t State::measure_probs(const reg_t &qubits) const reg_t qubits_out_chunk; for(i=0;i::measure_probs(const reg_t &qubits) const #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2); - icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2)); + irow = (BaseState::global_chunk_index_ + i) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); if(irow == icol){ //diagonal chunk auto chunkSum = BaseState::qregs_[i].probabilities(qubits); @@ -1076,12 +1219,12 @@ rvector_t State::measure_probs(const reg_t &qubits) const int idx = 0; int i_in = 0; for(k=0;k> i_in) & 1) << k); i_in++; } else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_/2)) >> qubits[k]) & 1){ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ idx += 1ull << k; } } @@ -1116,8 +1259,8 @@ std::vector State::sample_measure(const reg_t &qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2); - icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2)); + irow = (BaseState::global_chunk_index_ + i) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); if(irow == icol) //only diagonal chunk has probabilities chunkSum[i] = std::real( BaseState::qregs_[i].trace() ); else @@ -1150,29 +1293,33 @@ std::vector State::sample_measure(const reg_t &qubits, //get rnds positions for each chunk #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j) for(i=0;i vIdx; - std::vector vRnd; - - //find rnds in this chunk - nIn = 0; - for(j=0;j= chunkSum[i] + globalSum && rnds[j] < chunkSum[i+1] + globalSum){ - vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); - vIdx.push_back(j); - nIn++; - } + uint_t irow,icol; + irow = (BaseState::global_chunk_index_ + i) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow != icol) + continue; + + uint_t nIn; + std::vector vIdx; + std::vector vRnd; + + //find rnds in this chunk + nIn = 0; + for(j=0;j= chunkSum[i] + globalSum && rnds[j] < chunkSum[i+1] + globalSum){ + vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); + vIdx.push_back(j); + nIn++; } + } - if(nIn > 0){ - auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); - uint_t irow; - irow = (BaseState::global_chunk_index_ + i) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)/2); + if(nIn > 0){ + auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); + uint_t irow; + irow = (BaseState::global_chunk_index_ + i) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - for(j=0;j State::sample_measure(const reg_t &qubits, std::vector all_samples; all_samples.reserve(shots); for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_/2); + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); reg_t sample; sample.reserve(qubits.size()); for (uint_t qubit : qubits) { @@ -1291,7 +1438,7 @@ void State::measure_reset_update(const reg_t &qubits, template void State::apply_kraus(const reg_t &qubits, - const std::vector &kmats) + const std::vector &kmats) { int_t i; // Convert to Superoperator diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 850df29dc8..3767649b39 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -143,6 +143,7 @@ class DensityMatrixThrust : public UnitaryMatrixThrust { // Return the expectation value of an N-qubit Pauli matrix. // The Pauli is input as a length N string of I,X,Y,Z characters. double expval_pauli(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase=1.0) const; + double expval_pauli_non_diagonal_chunk(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase=1.0) const; protected: // Construct a vectorized superoperator from a vectorized matrix @@ -888,6 +889,68 @@ double DensityMatrixThrust::expval_pauli(const reg_t &qubits, expval_pauli_XYZ_func_dm(x_mask, z_mask, x_max, phase, BaseMatrix::rows_) ); } +template +class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + thrust::complex phase_; + uint_t rows_; +public: + expval_pauli_XYZ_func_dm_non_diagonal(uint_t x,uint_t z,uint_t x_max,std::complex p,uint_t stride) + { + rows_ = stride; + x_mask_ = x; + z_mask_ = z; + phase_ = p; + } + + uint_t size(int num_qubits) + { + return rows_; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + double ret = 0.0; + uint_t idx_mat; + + vec = this->data_; + + idx_mat = i ^ x_mask_ + rows_ * i; + + q0 = vec[idx_mat]; + q0 = phase_ * q0; + ret = q0.real(); + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + ret = -ret; + } + return ret; + } + const char* name(void) + { + return "expval_pauli_XYZ"; + } +}; + +template +double DensityMatrixThrust::expval_pauli_non_diagonal_chunk(const reg_t &qubits, + const std::string &pauli,const complex_t initial_phase) const +{ + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); + + // Compute the overall phase of the operator. + // This is (-1j) ** number of Y terms modulo 4 + auto phase = std::complex(initial_phase); + add_y_phase(num_y, phase); + return BaseVector::apply_function_sum( + expval_pauli_XYZ_func_dm_non_diagonal(x_mask, z_mask, x_max, phase, BaseMatrix::rows_) ); +} //----------------------------------------------------------------------- // Z-measurement outcome probabilities //----------------------------------------------------------------------- diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp index 3b9c5fe10e..f7485e56df 100644 --- a/src/simulators/state.hpp +++ b/src/simulators/state.hpp @@ -128,7 +128,7 @@ class State { const = 0; //memory allocation (previously called before inisitalize_qreg) - virtual void allocate(uint_t num_qubits) {} + virtual void allocate(uint_t num_qubits,uint_t block_bits) {} // Return the expectation value of a N-qubit Pauli operator // If the simulator does not support Pauli expectation value this should diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index b8ace7a198..61674a7610 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -118,7 +118,7 @@ class StateChunk { bool final_ops = false); //memory allocation (previously called before inisitalize_qreg) - virtual void allocate(uint_t num_qubits); + virtual void allocate(uint_t num_qubits,uint_t block_bits); // Initializes the State to the default state. // Typically this is the n-qubit all |0> state @@ -319,6 +319,11 @@ class StateChunk { void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index); void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index); + template + void send_data(data_t* pSend, uint_t size, uint_t myid,uint_t pairid); + template + void recv_data(data_t* pRecv, uint_t size, uint_t myid,uint_t pairid); + //reduce values over processes void reduce_sum(rvector_t& sum) const; void reduce_sum(complex_t& sum) const; @@ -433,13 +438,14 @@ void StateChunk::set_distribution(uint_t nprocs) } template -void StateChunk::allocate(uint_t num_qubits) +void StateChunk::allocate(uint_t num_qubits,uint_t block_bits) { int_t i; uint_t nchunks; int max_bits = num_qubits; num_qubits_ = num_qubits; + block_bits_ = block_bits; if(block_bits_ > 0){ chunk_bits_ = block_bits_; @@ -451,11 +457,7 @@ void StateChunk::allocate(uint_t num_qubits) chunk_bits_ = num_qubits_; } - //scale for density/unitary matrix simulators - chunk_bits_ *= qubit_scale(); - num_qubits_ *= qubit_scale(); - - num_global_chunks_ = 1ull << (num_qubits_ - chunk_bits_); + num_global_chunks_ = 1ull << ((num_qubits_ - chunk_bits_)*qubit_scale()); chunk_index_begin_.resize(distributed_procs_); chunk_index_end_.resize(distributed_procs_); @@ -469,8 +471,8 @@ void StateChunk::allocate(uint_t num_qubits) qregs_.resize(num_local_chunks_); - chunk_omp_parallel_ = false; gpu_optimization_ = false; + chunk_omp_parallel_ = false; if(qregs_[0].name().find("gpu") != std::string::npos){ if(chunk_bits_ < num_qubits_){ chunk_omp_parallel_ = true; //CUDA backend requires thread parallelization of chunk loop @@ -481,7 +483,7 @@ void StateChunk::allocate(uint_t num_qubits) nchunks = num_local_chunks_; for(i=0;i::block_diagonal_matrix(const int_t iChunk, reg_t &qubit cvector_t diag_in; for(i=0;i> (qubits[i] - chunk_bits_/qubit_scale())) & 1) + if((gid >> (qubits[i] - chunk_bits_)) & 1) mask_id |= (1ull << i); } } @@ -860,7 +862,7 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) q1 = t; } - if(q1 < chunk_bits_){ + if(q1 < chunk_bits_*qubit_scale()){ //device #pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk::apply_chunk_swap(const reg_t &qubits) uint_t nPair,mask0,mask1; uint_t baseChunk,iChunk1,iChunk2; - if(q0 < chunk_bits_) + if(q0 < chunk_bits_*qubit_scale()) nLarge = 1; else nLarge = 2; mask0 = (1ull << q0); mask1 = (1ull << q1); - mask0 >>= chunk_bits_; - mask1 >>= chunk_bits_; + mask0 >>= (chunk_bits_*qubit_scale()); + mask1 >>= (chunk_bits_*qubit_scale()); int proc_bits = 0; uint_t procs = distributed_procs_; @@ -893,8 +895,8 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) procs >>= 1; } - if(distributed_procs_ == 1 || (proc_bits >= 0 && q1 < (num_qubits_ - proc_bits))){ //no data transfer between processes is needed - if(q0 < chunk_bits_){ + if(distributed_procs_ == 1 || (proc_bits >= 0 && q1 < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed + if(q0 < chunk_bits_*qubit_scale()){ nPair = num_local_chunks_ >> 1; } else{ @@ -903,7 +905,7 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) #pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) for(iPair=0;iPair::apply_chunk_swap(const reg_t &qubits) uint_t iLocalChunk,iRemoteChunk,iProc; int i; - if(q0 < chunk_bits_){ + if(q0 < chunk_bits_*qubit_scale()){ nLarge = 1; - nu[0] = 1ull << (q1 - chunk_bits_); + nu[0] = 1ull << (q1 - chunk_bits_*qubit_scale()); ub[0] = 0; iu[0] = 0; - nu[1] = 1ull << (num_qubits_ - q1 - 1); - ub[1] = (q1 - chunk_bits_) + 1; + nu[1] = 1ull << (num_qubits_*qubit_scale() - q1 - 1); + ub[1] = (q1 - chunk_bits_*qubit_scale()) + 1; iu[1] = 0; } else{ nLarge = 2; - nu[0] = 1ull << (q0 - chunk_bits_); + nu[0] = 1ull << (q0 - chunk_bits_*qubit_scale()); ub[0] = 0; iu[0] = 0; nu[1] = 1ull << (q1 - q0 - 1); - ub[1] = (q0 - chunk_bits_) + 1; + ub[1] = (q0 - chunk_bits_*qubit_scale()) + 1; iu[1] = 0; - nu[2] = 1ull << (num_qubits_ - q1 - 1); - ub[2] = (q1 - chunk_bits_) + 1; + nu[2] = 1ull << (num_qubits_*qubit_scale() - q1 - 1); + ub[2] = (q1 - chunk_bits_*qubit_scale()) + 1; iu[2] = 0; } - nPair = 1ull << (num_qubits_ - chunk_bits_ - nLarge); + nPair = 1ull << (num_qubits_*qubit_scale() - chunk_bits_*qubit_scale() - nLarge); for(iPair=0;iPair::apply_chunk_swap(const reg_t &qubits) template -void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_chunk_index) +void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_pair_index) { #ifdef AER_MPI MPI_Request reqSend; @@ -1029,17 +1031,17 @@ void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_chu uint_t sizeSend; uint_t iProc; - iProc = get_process_by_chunk(global_chunk_index); + iProc = get_process_by_chunk(global_pair_index); auto pSend = qregs_[local_chunk_index].send_buffer(sizeSend); - MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,0,distributed_comm_,&reqSend); + MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,local_chunk_index + global_chunk_index_,distributed_comm_,&reqSend); MPI_Wait(&reqSend,&st); #endif } template -void StateChunk::recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index) +void StateChunk::recv_chunk(uint_t local_chunk_index, uint_t global_pair_index) { #ifdef AER_MPI MPI_Request reqRecv; @@ -1047,10 +1049,44 @@ void StateChunk::recv_chunk(uint_t local_chunk_index, uint_t global_chu uint_t sizeRecv; uint_t iProc; - iProc = get_process_by_chunk(global_chunk_index); + iProc = get_process_by_chunk(global_pair_index); auto pRecv = qregs_[local_chunk_index].recv_buffer(sizeRecv); - MPI_Irecv(pRecv,sizeRecv,MPI_BYTE,iProc,0,distributed_comm_,&reqRecv); + MPI_Irecv(pRecv,sizeRecv,MPI_BYTE,iProc,global_pair_index,distributed_comm_,&reqRecv); + + MPI_Wait(&reqRecv,&st); +#endif +} + +template +template +void StateChunk::send_data(data_t* pSend, uint_t size, uint_t myid,uint_t pairid) +{ +#ifdef AER_MPI + MPI_Request reqSend; + MPI_Status st; + uint_t iProc; + + iProc = get_process_by_chunk(pairid); + + MPI_Isend(pSend,size*sizeof(data_t),MPI_BYTE,iProc,myid,distributed_comm_,&reqSend); + + MPI_Wait(&reqSend,&st); +#endif +} + +template +template +void StateChunk::recv_data(data_t* pRecv, uint_t size, uint_t myid,uint_t pairid) +{ +#ifdef AER_MPI + MPI_Request reqRecv; + MPI_Status st; + uint_t iProc; + + iProc = get_process_by_chunk(pairid); + + MPI_Irecv(pRecv,size*sizeof(data_t),MPI_BYTE,iProc,pairid,distributed_comm_,&reqRecv); MPI_Wait(&reqRecv,&st); #endif diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index dc9c4c0894..e56446e58e 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -51,6 +51,8 @@ class Chunk } ~Chunk() { + if(cache_) + cache_.reset(); } void set_device(void) const diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index e90f4592c8..b024313ad2 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -517,7 +517,6 @@ template void ChunkContainer::UnmapChunk(std::shared_ptr> chunk) { chunk->unmap(); -// chunk.reset(); } template @@ -546,7 +545,6 @@ void ChunkContainer::UnmapBuffer(std::shared_ptr> buf) #pragma omp critical { buf->unmap(); -// buf.reset(); } } @@ -585,7 +583,6 @@ void ChunkContainer::UnmapCheckpoint(std::shared_ptr> buf) #pragma omp critical { buf->unmap(); -// buf.reset(); } } } diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 42b8e78892..8fe2ba9250 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -33,7 +33,7 @@ class DeviceChunkContainer : public ChunkContainer protected: AERDeviceVector> data_; //device vector to chunks and buffers AERDeviceVector> matrix_; //storage for large matrix - mutable AERDeviceVector params_; //storage for additional parameters + mutable AERDeviceVector params_; //storage for additional parameters AERDeviceVector reduce_buffer_; //buffer for reduction int device_id_; //device index std::vector peer_access_; //to which device accepts peer access @@ -349,6 +349,8 @@ uint_t DeviceChunkContainer::Resize(uint_t chunks,uint_t buffers,uint_t template void DeviceChunkContainer::Deallocate(void) { + set_device(); + data_.clear(); data_.shrink_to_fit(); matrix_.clear(); @@ -371,7 +373,6 @@ void DeviceChunkContainer::Deallocate(void) } stream_.clear(); #endif - } template diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp index b4b9fb8d96..a6b32d1375 100644 --- a/src/simulators/statevector/chunk/host_chunk_container.hpp +++ b/src/simulators/statevector/chunk/host_chunk_container.hpp @@ -166,8 +166,11 @@ template void HostChunkContainer::Deallocate(void) { data_.clear(); + data_.shrink_to_fit(); matrix_.clear(); + matrix_.shrink_to_fit(); params_.clear(); + params_.shrink_to_fit(); } diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index c68e7b1ce6..a3155f2c84 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -956,15 +956,11 @@ bool QubitVectorThrust::fetch_chunk(void) const int tid,nid; int idev; - tid = omp_get_thread_num(); - nid = omp_get_num_threads(); - - idev = tid * chunk_manager_.num_devices() / nid; - if(chunk_->device() < 0){ //on host + idev = 0; do{ - buffer_chunk_ = chunk_manager_.MapBufferChunk(idev); + buffer_chunk_ = chunk_manager_.MapBufferChunk(idev++ % chunk_manager_.num_devices()); }while(!buffer_chunk_); chunk_->set_cache(buffer_chunk_); buffer_chunk_->CopyIn(chunk_); @@ -2587,7 +2583,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto else{ thrust::complex* pChunk0; thrust::complex* pChunk1; - std::shared_ptr> pBuffer0; + std::shared_ptr> pBuffer0 = nullptr; std::shared_ptr> pExec; if(chunk_->device() >= 0){ diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index e7364fc7d3..24da46f3ce 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -149,7 +149,7 @@ class State : public Base::State { virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override; - virtual void allocate(uint_t num_qubits) override; + virtual void allocate(uint_t num_qubits,uint_t block_bits) override; //----------------------------------------------------------------------- // Additional methods @@ -437,7 +437,7 @@ const stringmap_t State::snapshotset_( // Initialization //------------------------------------------------------------------------- template -void State::allocate(uint_t num_qubits) +void State::allocate(uint_t num_qubits,uint_t block_bits) { BaseState::qreg_.chunk_setup(num_qubits,num_qubits,0,1); } diff --git a/src/simulators/statevector/statevector_state_chunk.hpp b/src/simulators/statevector/statevector_state_chunk.hpp index 5736a937fd..2ba9ce0855 100644 --- a/src/simulators/statevector/statevector_state_chunk.hpp +++ b/src/simulators/statevector/statevector_state_chunk.hpp @@ -33,16 +33,24 @@ namespace AER { namespace StatevectorChunk { +using OpType = Operations::OpType; + +// OpSet of supported instructions const Operations::OpSet StateOpSet( // Op types - {Operations::OpType::gate, Operations::OpType::measure, - Operations::OpType::reset, Operations::OpType::initialize, - Operations::OpType::snapshot, Operations::OpType::barrier, - Operations::OpType::bfunc, Operations::OpType::roerror, - Operations::OpType::matrix, Operations::OpType::diagonal_matrix, - Operations::OpType::multiplexer, Operations::OpType::kraus, - Operations::OpType::sim_op, Operations::OpType::save_expval, - Operations::OpType::save_expval_var}, + {OpType::gate, OpType::measure, + OpType::reset, OpType::initialize, + OpType::snapshot, OpType::barrier, + OpType::bfunc, OpType::roerror, + OpType::matrix, OpType::diagonal_matrix, + OpType::multiplexer, OpType::kraus, + OpType::sim_op, OpType::save_expval, + OpType::save_expval_var, OpType::save_densmat, + OpType::save_probs, OpType::save_probs_ket, + OpType::save_amps, OpType::save_amps_sq, + OpType::save_statevec + // OpType::save_statevec_ket // TODO + }, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", "cy", "cp", "cu1", "cu2", "cu3", "swap", "id", "p", @@ -52,19 +60,13 @@ const Operations::OpSet StateOpSet( "mcswap", "mcphase", "mcr", "mcrx", "mcry", "mcry", "sx", "csx", "mcsx", "delay", "pauli", "mcx_gray"}, // Snapshots - {"memory", "register", "probabilities", + {"statevector", "memory", "register", "probabilities", "probabilities_with_variance", "expectation_value_pauli", "density_matrix", + "density_matrix_with_variance", "expectation_value_pauli_with_variance", "expectation_value_matrix_single_shot", "expectation_value_matrix", "expectation_value_matrix_with_variance", "expectation_value_pauli_single_shot"}); -// Allowed gates enum class -enum class Gates { - id, h, s, sdg, t, tdg, - rxx, ryy, rzz, rzx, - mcx, mcy, mcz, mcr, mcrx, mcry, - mcrz, mcp, mcu2, mcu3, mcswap, mcsx, pauli -}; //========================================================================= // QubitVector State subclass @@ -119,6 +121,7 @@ class State : public Base::StateChunk { void initialize_omp(); auto move_to_vector(); + auto copy_to_vector(); protected: @@ -185,6 +188,30 @@ class State : public Base::StateChunk { // Save data instructions //----------------------------------------------------------------------- + // Save the current state of the statevector simulator + // If `last_op` is True this will use move semantics to move the simulator + // state to the results, otherwise it will use copy semantics to leave + // the current simulator state unchanged. + void apply_save_statevector(const Operations::Op &op, + ExperimentResult &result, + bool last_op); + + // Save the current state of the statevector simulator as a ket-form map. + void apply_save_statevector_ket(const Operations::Op &op, + ExperimentResult &result); + + // Save the current density matrix or reduced density matrix + void apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result); + + // Helper function for computing expectation value + void apply_save_probs(const Operations::Op &op, + ExperimentResult &result); + + // Helper function for saving amplitudes and amplitudes squared + void apply_save_amplitudes(const Operations::Op &op, + ExperimentResult &result); + // Helper function for computing expectation value virtual double expval_pauli(const reg_t &qubits, const std::string& pauli) override; @@ -480,6 +507,35 @@ auto State::move_to_vector() } } +template +auto State::copy_to_vector() +{ + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].copy_to_vector(); + } + else{ + int_t iChunk; + auto state = BaseState::qregs_[0].copy_to_vector(); + + //TO DO check memory availability + state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); + +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) + for(iChunk=1;iChunk::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result); + break; + case Operations::OpType::save_statevec: + apply_save_statevector(op, result, final_ops); + break; + // case Operations::OpType::save_statevec_ket: + // apply_save_statevector_ket(op, result); + // break; + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + apply_save_probs(op, result); + break; + case Operations::OpType::save_amps: + case Operations::OpType::save_amps_sq: + apply_save_amplitudes(op, result); + break; default: throw std::invalid_argument("QubitVector::State::invalid instruction \'" + op.name + "\'."); @@ -546,6 +619,22 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, // Implementation: Save data //========================================================================= +template +void State::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + // get probs as hexadecimal + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + // Convert to ket dict + BaseState::save_data_average(result, op.string_params[0], + Utils::vec2ket(probs, json_chop_threshold_, 16), + op.save_type); + } else { + BaseState::save_data_average(result, op.string_params[0], + std::move(probs), op.save_type); + } +} + template double State::expval_pauli(const reg_t &qubits, const std::string& pauli) @@ -585,7 +674,7 @@ double State::expval_pauli(const reg_t &qubits, bool on_same_process = true; #ifdef AER_MPI int proc_bits = 0; - uint_t procs = distributed_procs_; + uint_t procs = BaseState::distributed_procs_; while(procs > 1){ if((procs & 1) != 0){ proc_bits = -1; @@ -618,12 +707,12 @@ double State::expval_pauli(const reg_t &qubits, z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); if(iProc == BaseState::distributed_rank_){ //pair is on the same process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); } else{ BaseState::recv_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); //refer receive buffer to calculate expectation value - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); } } else if(iProc == BaseState::distributed_rank_){ //pair is on this process @@ -655,6 +744,111 @@ double State::expval_pauli(const reg_t &qubits, return expval; } +template +void State::apply_save_statevector(const Operations::Op &op, + ExperimentResult &result, + bool last_op) +{ + if (op.qubits.size() != BaseState::num_qubits_) { + throw std::invalid_argument( + op.name + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + if (last_op) { + BaseState::save_data_pershot(result, op.string_params[0], + move_to_vector(), + op.save_type); + } else { + BaseState::save_data_pershot(result, op.string_params[0], + copy_to_vector(), + op.save_type); + } +} + +template +void State::apply_save_statevector_ket(const Operations::Op &op, + ExperimentResult &result) +{ + if (op.qubits.size() != BaseState::num_qubits_) { + throw std::invalid_argument( + op.name + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + // TODO: compute state ket + std::map state_ket; + + BaseState::save_data_pershot(result, op.string_params[0], + std::move(state_ket), op.save_type); +} + +template +void State::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result) +{ + cmatrix_t reduced_state; + + // Check if tracing over all qubits + if (op.qubits.empty()) { + reduced_state = cmatrix_t(1, 1); + + double sum = 0.0; +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) + for(int_t i=0;i +void State::apply_save_amplitudes(const Operations::Op &op, + ExperimentResult &result) +{ + if (op.int_params.empty()) { + throw std::invalid_argument("Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + amps[i] = 0.0; + if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ + amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + } +#ifdef AER_MPI + complex_t amp = amps[i]; + BaseState::reduce_sum(amp); + amps[i] = amp; +#endif + } + BaseState::save_data_pershot(result, op.string_params[0], + std::move(amps), op.save_type); + } + else{ + rvector_t amps_sq(size,0); + for (int_t i = 0; i < size; ++i) { + uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ + amps_sq[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].probability(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + } + } +#ifdef AER_MPI + BaseState::reduce_sum(amps_sq); +#endif + BaseState::save_data_average(result, op.string_params[0], + std::move(amps_sq), op.save_type); + } +} + //========================================================================= // Implementation: Snapshots //========================================================================= @@ -926,7 +1120,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { // Return full density matrix cmatrix_t densmat(DIM, DIM); - if ((N == BaseState::qregs_[0].num_qubits()) && (qubits == qubits_sorted)) { + if ((N == BaseState::num_qubits_) && (qubits == qubits_sorted)) { const int_t mask = QV::MASKS[N]; #pragma omp parallel for if (2 * N > omp_qubit_threshold_ && \ BaseState::threads_ > 1) \ @@ -937,7 +1131,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { densmat(row, col) = complex_t(vec[row]) * complex_t(std::conj(vec[col])); } } else { - const size_t END = 1ULL << (BaseState::qregs_[0].num_qubits() - N); + const size_t END = 1ULL << (BaseState::num_qubits_ - N); // Initialize matrix values with first block { const auto inds = QV::indexes(qubits, qubits_sorted, 0); diff --git a/src/simulators/unitary/unitary_state.hpp b/src/simulators/unitary/unitary_state.hpp index 3b63562721..17bdd91c4b 100755 --- a/src/simulators/unitary/unitary_state.hpp +++ b/src/simulators/unitary/unitary_state.hpp @@ -104,7 +104,7 @@ class State : public Base::State { // Config: {"omp_qubit_threshold": 7} virtual void set_config(const json_t &config) override; - virtual void allocate(uint_t num_qubits) override; + virtual void allocate(uint_t num_qubits,uint_t block_bits) override; //----------------------------------------------------------------------- // Additional methods @@ -256,7 +256,7 @@ const stringmap_t State::gateset_({ }); template -void State::allocate(uint_t num_qubits) +void State::allocate(uint_t num_qubits,uint_t block_bits) { BaseState::qreg_.chunk_setup(num_qubits*2,num_qubits*2,0,1); } diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp index d98f0cac35..a0276cc7d1 100644 --- a/src/simulators/unitary/unitary_state_chunk.hpp +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -27,8 +27,6 @@ #include "unitarymatrix_thrust.hpp" #endif -//#include "unitary_state.hpp" - namespace AER { namespace QubitUnitaryChunk { @@ -36,7 +34,8 @@ namespace QubitUnitaryChunk { const Operations::OpSet StateOpSet( // Op types {Operations::OpType::gate, Operations::OpType::barrier, - Operations::OpType::matrix, Operations::OpType::diagonal_matrix}, + Operations::OpType::matrix, Operations::OpType::diagonal_matrix, + Operations::OpType::snapshot, Operations::OpType::save_unitary}, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", "cy", "cp", "cu1", "cu2", "cu3", "swap", "id", "p", @@ -46,13 +45,7 @@ const Operations::OpSet StateOpSet( "mcswap", "mcphase", "mcr", "mcrx", "mcry", "mcry", "sx", "csx", "mcsx", "delay", "pauli"}, // Snapshots - {}); - -// Allowed gates enum class -enum class Gates { - id, h, s, sdg, t, tdg, rxx, ryy, rzz, rzx, - mcx, mcy, mcz, mcr, mcrx, mcry, mcrz, mcp, mcu2, mcu3, mcswap, mcsx, pauli, -}; + {"unitary"}); //========================================================================= // QubitUnitary State subclass @@ -128,6 +121,9 @@ class State : public Base::StateChunk { // Apply a matrix to given qubits (identity on all other qubits) void apply_matrix(const uint_t iChunk,const reg_t &qubits, const cvector_t &vmat); + // Apply a diagonal matrix + void apply_diagonal_matrix(const uint_t iChunk,const reg_t &qubits, const cvector_t &diag); + //----------------------------------------------------------------------- // 1-Qubit Gates //----------------------------------------------------------------------- @@ -197,7 +193,7 @@ void State::apply_op(const int_t iChunk,const Operations::Op & apply_matrix(iChunk,op.qubits, op.mats[0]); break; case Operations::OpType::diagonal_matrix: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, op.params); + apply_diagonal_matrix(iChunk,op.qubits, op.params); break; default: throw std::invalid_argument( @@ -240,7 +236,7 @@ void State::initialize_qreg(uint_t num_qubits) if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(i=0;i::initialize_qreg(uint_t num_qubits) else{ //multi-chunk distribution #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -278,19 +274,19 @@ void State::initialize_qreg(uint_t num_qubits, int_t iChunk; if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, int_t iChunk; if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk::move_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } - else{ - int_t iChunk; - auto state = BaseState::qregs_[0].vector(); //using vector to gather distributed matrix + int_t iChunk; + uint_t size = 1ull << (BaseState::chunk_bits_*2); + uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); + + auto matrix = BaseState::qregs_[0].copy_to_matrix(); + if(BaseState::distributed_rank_ == 0){ //TO DO check memory availability - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=1;iChunk 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t icol = i & mask; + matrix[offset+i] = recv(icol,irow); } } +#endif + for(iChunk=0;iChunk 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t icol = i & mask; + matrix[offset+i] = tmp(icol,irow); + } + } + } + else{ #ifdef AER_MPI - BaseState::gather_state(state); + //send matrices to process 0 + for(iChunk=0;iChunk::apply_gate(const uint_t iChunk,const Operations::O BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ryy(op.params[0])); break; case QubitUnitary::Gates::rzz: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); + apply_diagonal_matrix(iChunk,op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); break; case QubitUnitary::Gates::rzx: BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); @@ -497,12 +521,28 @@ void State::apply_matrix(const uint_t iChunk,const reg_t &qubi const cvector_t &vmat) { // Check if diagonal matrix if (vmat.size() == 1ULL << qubits.size()) { - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, vmat); + apply_diagonal_matrix(iChunk,qubits, vmat); } else { BaseState::qregs_[iChunk].apply_matrix(qubits, vmat); } } +template +void State::apply_diagonal_matrix(const uint_t iChunk, const reg_t &qubits, const cvector_t &diag) +{ + if(BaseState::gpu_optimization_){ + //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is + BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits,diag); + } + else{ + reg_t qubits_in = qubits; + cvector_t diag_in = diag; + + BaseState::block_diagonal_matrix(iChunk,qubits_in,diag_in); + BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in,diag_in); + } +} + template void State::apply_gate_phase(const uint_t iChunk,uint_t qubit, complex_t phase) { cmatrix_t diag(1, 2); @@ -540,8 +580,7 @@ void State::apply_global_phase() { int_t i; #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i