Skip to content

Commit

Permalink
Merge pull request #619 from jakurzak/master
Browse files Browse the repository at this point in the history
Add support for AMD GPUs
  • Loading branch information
95-martin-orion authored Nov 9, 2023
2 parents 3e7da1b + 33c49f2 commit a5e7a82
Show file tree
Hide file tree
Showing 24 changed files with 433 additions and 55 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)

execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
if(has_nvcc STREQUAL "")
project(qsim)
execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
if(has_hipcc STREQUAL "")
project(qsim)
else()
project(qsim LANGUAGES CXX HIP)
ADD_SUBDIRECTORY(pybind_interface/hip)
endif()
else()
project(qsim LANGUAGES CXX CUDA)
ADD_SUBDIRECTORY(pybind_interface/cuda)
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ TESTS = run-cxx-tests

CXX=g++
NVCC=nvcc
HIPCC=hipcc

CXXFLAGS = -O3 -fopenmp
ARCHFLAGS = -march=native
NVCCFLAGS = -O3
HIPCCFLAGS = -O3

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
Expand All @@ -22,6 +24,8 @@ export ARCHFLAGS
export NVCC
export NVCCFLAGS
export CUSTATEVECFLAGS
export HIPCC
export HIPCCFLAGS

ifeq ($(PYBIND11), true)
TARGETS += pybind
Expand All @@ -43,6 +47,10 @@ qsim-cuda:
qsim-custatevec:
$(MAKE) -C apps/ qsim-custatevec

.PHONY: qsim-hip
qsim-hip:
$(MAKE) -C apps/ qsim-hip

.PHONY: pybind
pybind:
$(MAKE) -C pybind_interface/ pybind
Expand All @@ -59,6 +67,10 @@ cuda-tests:
custatevec-tests:
$(MAKE) -C tests/ custatevec-tests

.PHONY: hip-tests
hip-tests:
$(MAKE) -C tests/ hip-tests

.PHONY: run-cxx-tests
run-cxx-tests: cxx-tests
$(MAKE) -C tests/ run-cxx-tests
Expand All @@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
run-custatevec-tests: custatevec-tests
$(MAKE) -C tests/ run-custatevec-tests

.PHONY: run-hip-tests
run-hip-tests: hip-tests
$(MAKE) -C tests/ run-hip-tests

PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')

.PHONY: run-py-tests
Expand Down
9 changes: 9 additions & 0 deletions apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)

HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)

.PHONY: qsim
qsim: $(CXX_TARGETS)

Expand All @@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
.PHONY: qsim-custatevec
qsim-custatevec: $(CUSTATEVEC_TARGETS)

.PHONY: qsim-hip
qsim-hip: $(HIP_TARGETS)

%.x: %.cc
$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)

Expand All @@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
%custatevec.x: %custatevec.cu
$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)

%hip.x: %cuda.cu
$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)

.PHONY: clean
clean:
-rm -f ./*.x ./*.a ./*.so ./*.mod
16 changes: 11 additions & 5 deletions apps/make.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc

nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
if command -v nvcc &>/dev/null; then
nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
if [ -n "$CUQUANTUM_ROOT" ]; then
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
fi
elif command -v hipcc &>/dev/null; then
hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
fi
2 changes: 2 additions & 0 deletions docs/_book.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ upper_tabs:
path: /qsim/tutorials/q32d14
- title: "Simulate noise"
path: /qsim/tutorials/noisy_qsimcirq
- title: "AMD GPU support"
path: /qsim/tutorials/amd_gpu

- name: "Guides"
contents:
Expand Down
86 changes: 86 additions & 0 deletions docs/tutorials/amd_gpu.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Support for AMD Instinct™ MI Series Accelerators

qsim provides support for AMD Instinct accelerators.
The implementation covers the native GPU support in qsim
by utilizing [AMD HIP SDK](https://rocm.docs.amd.com/projects/HIP)
(Heterogeneous-Compute Interface for Portability).
The cuQuantum implementation is currently not covered.

## Building

Building qsim with support for AMD Instinct accelerators requires installation of
[AMD ROCm™ Open Software Platform](https://www.amd.com/en/developer/resources/rocm-hub.html).
Instructions for installing ROCm are available at https://rocm.docs.amd.com/.

To enable support for AMD GPUs, qsim needs to be built from sources.
This can be done as follows:

```
conda env list
conda create -y -n CirqDevEnv python=3
conda activate CirqDevEnv
pip install pybind11
git clone https://github.com/quantumlib/qsim.git
cd qsim
make -j qsim # to build CPU qsim
make -j qsim-hip # to build HIP qsim
make -j pybind # to build Python bindings
make -j cxx-tests # to build CPU tests
make -j hip-tests # to build HIP tests
pip install .
```

Note: To avoid problems when building qsim with support for AMD GPUs,
make sure to use the latest version of CMake.

## Testing

### Simulator

To test the qsim simulator:

```
make run-cxx-tests # to run CPU tests
make run-hip-tests # to run HIP tests
```

or

```
cd tests
for file in *.x; do ./"$file"; done # to run all tests
for file in *_hip_test.x; do ./"$file"; done # to run HIP tests only
```

### Python Bindings

To test the Python bindings:

```
make run-py-tests
```

or

```
cd qsimcirq_tests
python3 -m pytest -v qsimcirq_test.py
```

## Using

Using qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
I.e., it is done by passing `use_gpu=True` and `gpu_mode=0` as `qsimcirq.QSimOptions`:

```
simulator = qsimcirq.QSimSimulator(qsim_options=qsimcirq.QSimOptions(
use_gpu=True,
gpu_mode=0,
...
))
```

Note: `gpu_mode` has to be set to zero for AMD GPUs, as cuStateVec is not supported.
61 changes: 61 additions & 0 deletions lib/cuda2hip.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2023 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SIMULATOR_CUDA2HIP_H_
#define SIMULATOR_CUDA2HIP_H_

#define cublasCaxpy hipblasCaxpy
#define cublasCdotc hipblasCdotc
#define cublasCreate hipblasCreate
#define cublasCscal hipblasCscal
#define cublasCsscal hipblasCsscal
#define cublasDestroy hipblasDestroy
#define cublasDznrm2 hipblasDznrm2
#define cublasHandle_t hipblasHandle_t
#define cublasScnrm2 hipblasScnrm2
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define cublasStatus_t hipblasStatus_t
#define cublasZaxpy hipblasZaxpy
#define cublasZdotc hipblasZdotc
#define cublasZdscal hipblasZdscal
#define cublasZscal hipblasZscal
#define cuCimagf hipCimagf
#define cuCimag hipCimag
#define cuComplex hipComplex
#define cuCrealf hipCrealf
#define cuCreal hipCreal
#define CUDA_C_32F HIPBLAS_C_32F
#define CUDA_C_64F HIPBLAS_C_64F
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaGetErrorString hipGetErrorString
#define cudaMalloc hipMalloc
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpy hipMemcpy
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemset hipMemset
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaSuccess hipSuccess
#define cuDoubleComplex hipDoubleComplex

template <typename T>
__device__ __forceinline__ T __shfl_down_sync(
unsigned mask, T var, unsigned int delta, int width = warpSize) {
return __shfl_down(var, delta, width);
}

#endif // SIMULATOR_CUDA2HIP_H_
4 changes: 0 additions & 4 deletions lib/fuser_mqubit.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
std::vector<GateF*>& orphaned_gates,
std::vector<GateFused>& fused_gates) {
unsigned count = 0;

for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
auto ogate1 = orphaned_gates[i];

Expand All @@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {

if (ogate2->visited == kFinal) continue;

++count;

unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();

if (cur_size <= max_fused_size) {
Expand Down
13 changes: 9 additions & 4 deletions lib/simulator_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
#ifndef SIMULATOR_CUDA_KERNELS_H_
#define SIMULATOR_CUDA_KERNELS_H_

#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

namespace qsim {

Expand Down
10 changes: 8 additions & 2 deletions lib/statespace_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_H_
#define STATESPACE_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include <algorithm>
#include <complex>
Expand Down Expand Up @@ -102,7 +107,8 @@ class StateSpaceCUDA :
}

void SetAllZeros(State& state) const {
cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
ErrorCheck(cudaMemset(state.get(), 0,
MinSize(state.num_qubits()) * sizeof(fp_type)));
}

// Uniform superposition.
Expand Down
7 changes: 6 additions & 1 deletion lib/statespace_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_KERNELS_H_
#define STATESPACE_CUDA_KERNELS_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include "util_cuda.h"

Expand Down
6 changes: 5 additions & 1 deletion lib/util_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
#ifndef UTIL_CUDA_H_
#define UTIL_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#endif

#include <cstdlib>

Expand Down
Loading

0 comments on commit a5e7a82

Please sign in to comment.