Skip to content

Commit

Permalink
Fixes of multi-chunk State implementation (#1149)
Browse files Browse the repository at this point in the history
Co-authored-by: Victor Villar <vvilpas@gmail.com>
Co-authored-by: Christopher J. Wood <cjwood@us.ibm.com>
  • Loading branch information
3 people authored Mar 10, 2021
1 parent 3d2575a commit acd216d
Show file tree
Hide file tree
Showing 25 changed files with 1,113 additions and 477 deletions.
3 changes: 3 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,10 @@ This technique allows applying quantum gates to each chunk independently without
Before the actual simulation, we apply transpilation to remap the input circuits to equivalent circuits in which all the quantum gates act on qubits lower than the chunk's number of qubits.
(Noiseless) swap gates are inserted to exchange data.

Please refer to this paper (https://arxiv.org/abs/2102.02957) for a more detailed description of the algorithm and implementation of parallel simulation.

To simulate using multiple GPUs or multiple nodes on a cluster, the following configurations should be set in the backend options.
(If there is not enough memory to simulate the input circuit, Qiskit Aer automatically sets the following options, but it is recommended to set them explicitly.)

- blocking_enable

Expand Down
55 changes: 55 additions & 0 deletions src/controllers/controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "noise/noise_model.hpp"
#include "transpile/basic_opts.hpp"
#include "transpile/truncate_qubits.hpp"
#include "transpile/cacheblocking.hpp"

namespace AER {
namespace Base {
Expand Down Expand Up @@ -216,8 +217,19 @@ class Controller {
set_distributed_parallelization(const std::vector<Circuit> &circuits,
const std::vector<Noise::NoiseModel> &noise);

// Return true if the circuit should be simulated with the multi-chunk
// State implementation.
// Circuits with fewer than 3 qubits never require multiple chunks.
// Otherwise multiple chunks are required when an experiment is spread
// over more than one process, when the required memory exceeds the
// minimum available memory, or when the "blocking_qubits" option is
// at least 2 and smaller than the number of qubits of the circuit.
virtual bool multiple_chunk_required(const Circuit &circuit,
const Noise::NoiseModel &noise) const;

void save_exception_to_results(Result &result,const std::exception &e);


// Build the cache blocking transpiler pass for a circuit.
// The pass is first configured from `config`; if that does not enable
// blocking, it is enabled automatically when multiple_chunk_required()
// returns true for the circuit.
//   circ         : circuit to be simulated
//   noise        : noise model used for the memory requirement check
//   config       : backend options holding the blocking settings
//   complex_size : size in bytes of one complex amplitude
//                  (callers pass sizeof(std::complex<float>) or
//                  sizeof(std::complex<double>))
//   is_matrix    : true for density matrix methods, false for statevector
Transpile::CacheBlocking transpile_cache_blocking(const Circuit& circ,
const Noise::NoiseModel& noise,
const json_t& config,
const size_t complex_size,bool is_matrix) const;


// Get system memory size
size_t get_system_memory_mb();
size_t get_gpu_memory_mb();
Expand Down Expand Up @@ -274,6 +286,8 @@ class Controller {
//process information (MPI)
int myrank_ = 0;
int num_processes_ = 1;

// Value of the "blocking_qubits" config option (0 when the option is not
// given); read by multiple_chunk_required()
uint_t cache_block_qubit_ = 0;
};

//=========================================================================
Expand Down Expand Up @@ -348,6 +362,11 @@ void Controller::set_config(const json_t &config) {
JSON::get_value(accept_distributed_results_, "accept_distributed_results", config);
}

//enable multiple qregs if cache blocking is enabled
cache_block_qubit_ = 0;
if(JSON::check_key("blocking_qubits", config)){
JSON::get_value(cache_block_qubit_,"blocking_qubits", config);
}
}

void Controller::clear_config() {
Expand Down Expand Up @@ -535,6 +554,21 @@ uint_t Controller::get_distributed_num_processes(bool par_shots) const
}
}

// Decide whether `circ` has to be simulated with multiple chunks.
// Circuits with fewer than 3 qubits are never chunked. Larger circuits are
// chunked when the experiment runs on more than one process, when the
// required memory exceeds the minimum available memory, or when the
// configured blocking qubit count lies in [2, circ.num_qubits).
bool Controller::multiple_chunk_required(const Circuit &circ,
                                         const Noise::NoiseModel &noise) const
{
  if (circ.num_qubits >= 3) {
    // Distributed execution is checked first so that the memory estimate
    // is only evaluated for single-process experiments
    const bool distributed = (num_process_per_experiment_ > 1);
    if (distributed ||
        Controller::get_min_memory_mb() < required_memory_mb(circ, noise)) {
      return true;
    }

    // Blocking explicitly requested on fewer qubits than the circuit has
    const bool blocking_requested =
        (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits);
    if (blocking_requested) {
      return true;
    }
  }

  return false;
}

size_t Controller::get_system_memory_mb() {
size_t total_physical_memory = 0;
#if defined(__linux__) || defined(__APPLE__)
Expand Down Expand Up @@ -654,6 +688,27 @@ void Controller::save_exception_to_results(Result &result,const std::exception &
}
}

// Create the cache blocking pass used to split a circuit into chunks.
// Settings found in `config` take precedence. When they do not enable
// blocking and multiple_chunk_required() reports that the circuit needs
// multiple chunks, blocking is set up from the minimum available memory
// and the number of places (processes per experiment, multiplied by the
// number of GPUs when there are any).
// `complex_size` is the size in bytes of one complex amplitude and
// `is_matrix` is forwarded unchanged to the pass.
Transpile::CacheBlocking Controller::transpile_cache_blocking(const Circuit& circ,
                                                              const Noise::NoiseModel& noise,
                                                              const json_t& config,
                                                              const size_t complex_size,bool is_matrix) const
{
  Transpile::CacheBlocking blocking;
  blocking.set_config(config);

  // Only fall back to automatic blocking when the config did not enable it
  if (!blocking.enabled() && multiple_chunk_required(circ, noise)) {
    int places = num_process_per_experiment_;
    if (num_gpus_ > 0) {
      places *= num_gpus_;
    }

    // Minimum memory is reported in MB; the pass is given bytes
    const auto memory_bytes = get_min_memory_mb() << 20;
    blocking.set_blocking(circ.num_qubits, memory_bytes, places, complex_size, is_matrix);
  }

  return blocking;
}

//-------------------------------------------------------------------------
// Qobj execution
//-------------------------------------------------------------------------
Expand Down
94 changes: 28 additions & 66 deletions src/controllers/qasm_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,6 @@ class QasmController : public Base::Controller {
const Operations::OpSet &opset,
const json_t& config) const;


// Build the cache blocking transpiler pass for a circuit.
// The pass is configured from `config`; when that does not enable blocking,
// it is enabled automatically if the experiment uses more than one process
// or the circuit needs more memory than the minimum available memory.
Transpile::CacheBlocking transpile_cache_blocking(const Circuit& circ,
const Noise::NoiseModel& noise,
const json_t& config) const;

//----------------------------------------------------------------
// Run circuit helpers
//----------------------------------------------------------------
Expand Down Expand Up @@ -306,9 +301,6 @@ class QasmController : public Base::Controller {

// Controller-level parameter for CH method
bool extended_stabilizer_measure_sampling_ = false;

// Use the multi-chunk State implementations; set from the
// "blocking_enable" config option in set_config()
bool multiple_qregs_ = false;
};

//=========================================================================
Expand Down Expand Up @@ -381,11 +373,6 @@ void QasmController::set_config(const json_t& config) {
"QasmController: initial_statevector is not a unit vector");
}
}

//enable multiple qregs if cache blocking is enabled
if(JSON::check_key("blocking_enable", config)){
JSON::get_value(multiple_qregs_,"blocking_enable", config);
}
}

void QasmController::clear_config() {
Expand All @@ -407,7 +394,7 @@ void QasmController::run_circuit(const Circuit& circ,
// Validate circuit for simulation method
switch (simulation_method(circ, noise, true)) {
case Method::statevector: {
if(multiple_qregs_){
if(Base::Controller::multiple_chunk_required(circ,noise)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision Statevector simulation
return run_circuit_helper<StatevectorChunk::State<QV::QubitVector<double>>>(
Expand Down Expand Up @@ -440,7 +427,7 @@ void QasmController::run_circuit(const Circuit& circ,
"QasmController: method statevector_gpu is not supported on this "
"system");
#else
if(multiple_qregs_ || (parallel_shots_ > 1 || parallel_experiments_ > 1)){
if(Base::Controller::multiple_chunk_required(circ,noise) || (parallel_shots_ > 1 || parallel_experiments_ > 1)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision Statevector simulation
return run_circuit_helper<
Expand Down Expand Up @@ -478,7 +465,7 @@ void QasmController::run_circuit(const Circuit& circ,
"QasmController: method statevector_thrust is not supported on this "
"system");
#else
if(multiple_qregs_){
if(Base::Controller::multiple_chunk_required(circ,noise)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision Statevector simulation
return run_circuit_helper<
Expand Down Expand Up @@ -511,7 +498,7 @@ void QasmController::run_circuit(const Circuit& circ,
#endif
}
case Method::density_matrix: {
if(multiple_qregs_){
if(Base::Controller::multiple_chunk_required(circ,noise)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision density matrix simulation
return run_circuit_helper<
Expand Down Expand Up @@ -548,7 +535,7 @@ void QasmController::run_circuit(const Circuit& circ,
"QasmController: method density_matrix_gpu is not supported on this "
"system");
#else
if(multiple_qregs_ || (parallel_shots_ > 1 || parallel_experiments_ > 1)){
if(Base::Controller::multiple_chunk_required(circ,noise) || (parallel_shots_ > 1 || parallel_experiments_ > 1)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision density matrix simulation
return run_circuit_helper<
Expand Down Expand Up @@ -586,7 +573,7 @@ void QasmController::run_circuit(const Circuit& circ,
"this "
"system");
#else
if(multiple_qregs_){
if(Base::Controller::multiple_chunk_required(circ,noise)){
if (simulation_precision_ == Precision::double_precision) {
// Double-precision density matrix simulation
return run_circuit_helper<
Expand Down Expand Up @@ -938,42 +925,6 @@ Transpile::Fusion QasmController::transpile_fusion(Method method,
return fusion_pass;
}

// Build the cache blocking transpiler pass for a circuit.
// The pass is first configured from `config`. If that does not enable
// blocking, blocking is set automatically when the experiment is spread
// over more than one process or when the circuit requires more memory than
// the minimum available memory.
// Automatic blocking is only available for the statevector and density
// matrix methods; for any other method a std::runtime_error is thrown.
Transpile::CacheBlocking QasmController::transpile_cache_blocking(const Circuit& circ,
                                                                  const Noise::NoiseModel& noise,
                                                                  const json_t& config) const
{
  Transpile::CacheBlocking cache_block_pass;

  cache_block_pass.set_config(config);
  if(cache_block_pass.enabled()){
    //blocking is explicitly set by config, nothing to do
    return cache_block_pass;
  }

  //if blocking is not set by config, automatically set if required
  if(Base::Controller::num_process_per_experiment_ > 1 || Base::Controller::get_min_memory_mb() < required_memory_mb(circ, noise)){
    //density matrix methods store a matrix, statevector methods a vector
    bool is_matrix = false;
    switch (simulation_method(circ, noise, false)) {
      case Method::statevector:
      case Method::statevector_thrust_cpu:
      case Method::statevector_thrust_gpu:
        is_matrix = false;
        break;
      case Method::density_matrix:
      case Method::density_matrix_thrust_cpu:
      case Method::density_matrix_thrust_gpu:
        is_matrix = true;
        break;
      default:
        throw std::runtime_error("QasmController: Not enough memory to simulate this method on the system");
    }

    //number of places: processes per experiment times GPUs (if any)
    int nplace = Base::Controller::num_process_per_experiment_;
    if(Base::Controller::num_gpus_ > 0)
      nplace *= Base::Controller::num_gpus_;

    size_t complex_size = (simulation_precision_ == Precision::single_precision) ? sizeof(std::complex<float>) : sizeof(std::complex<double>);

    //minimum memory is in MB, set_blocking is given the size in bytes
    cache_block_pass.set_blocking(circ.num_qubits, Base::Controller::get_min_memory_mb() << 20, nplace, complex_size, is_matrix);
  }

  return cache_block_pass;
}

void QasmController::set_parallelization_circuit(
const Circuit& circ,
const Noise::NoiseModel& noise_model) {
Expand Down Expand Up @@ -1148,9 +1099,19 @@ void QasmController::run_circuit_helper(const Circuit& circ,
auto fusion_pass = transpile_fusion(method, opt_circ.opset(), config);
fusion_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result);

auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config);
bool is_matrix = false;
if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu)
is_matrix = true;
auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex<float>) : sizeof(std::complex<double>),is_matrix);
cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result);

uint_t block_bits = 0;
if(cache_block_pass.enabled())
block_bits = cache_block_pass.block_bits();

//allocate qubit register
state.allocate(Base::Controller::max_qubits_,block_bits);

// Run simulation
run_multi_shot(opt_circ, shots, state, initial_state, method, result, rng);
}
Expand Down Expand Up @@ -1179,9 +1140,6 @@ void QasmController::run_multi_shot(const Circuit& circ,
// Implement measure sampler
auto pos = circ.first_measure_pos; // Position of first measurement op

//allocate qubit register
state.allocate(Base::Controller::max_qubits_);

// Run circuit instructions before first measure
std::vector<Operations::Op> ops(circ.ops.begin(),
circ.ops.begin() + pos);
Expand All @@ -1197,9 +1155,6 @@ void QasmController::run_multi_shot(const Circuit& circ,
// Add measure sampling metadata
result.metadata.add(true, "measure_sampling");
} else {
//allocate qubit register
state.allocate(Base::Controller::max_qubits_);

// Perform standard execution if we cannot apply the
// measurement sampling optimization
while (shots-- > 0) {
Expand All @@ -1225,10 +1180,10 @@ void QasmController::run_circuit_with_sampled_noise(const Circuit& circ,
measure_pass.set_config(config);
Noise::NoiseModel dummy_noise;

auto cache_block_pass = transpile_cache_blocking(circ,noise,config);

//allocate qubit register
state.allocate(Base::Controller::max_qubits_);
bool is_matrix = false;
if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu)
is_matrix = true;
auto cache_block_pass = transpile_cache_blocking(circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex<float>) : sizeof(std::complex<double>),is_matrix);

// Sample noise using circuit method
while (shots-- > 0) {
Expand All @@ -1238,6 +1193,13 @@ void QasmController::run_circuit_with_sampled_noise(const Circuit& circ,
fusion_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), result);
cache_block_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), result);

uint_t block_bits = 0;
if(cache_block_pass.enabled())
block_bits = cache_block_pass.block_bits();

//allocate qubit register
state.allocate(Base::Controller::max_qubits_,block_bits);

run_single_shot(noise_circ, state, initial_state, result, rng);
}
}
Expand Down
Loading

0 comments on commit acd216d

Please sign in to comment.