Skip to content

Commit

Permalink
[rhi] Update compute CommandList APIs (except dispatch) (taichi-dev#7037
Browse files Browse the repository at this point in the history
)

Issue: taichi-dev#6832

### Brief Summary

Compute-only CommandList APIs other than dispatch have been changed to
use `noexcept`, behavior specifications have been added, and
relevant checks are now in place for Vulkan and partially for DX11.

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and feisuzhu committed Jan 5, 2023
1 parent 94d1ecc commit f68aa3a
Show file tree
Hide file tree
Showing 13 changed files with 279 additions and 97 deletions.
1 change: 1 addition & 0 deletions misc/prtags.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"dx12" : "DirectX 12 backend",
"spirv" : "SPIR-V common codegen",
"gfx" : "Common graphics runtime",
"rhi" : "Unified Device API",
"wasm" : "WebAssembly backend",
"misc" : "Miscellaneous",
"std" : "Standard library",
Expand Down
19 changes: 11 additions & 8 deletions taichi/rhi/amdgpu/amdgpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,20 @@ class AmdgpuCommandList : public CommandList {
~AmdgpuCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept final{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final{TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) final{
int set_index = 0) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept final{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept final{
TI_NOT_IMPLEMENTED};
void memory_barrier() noexcept final{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept final{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
18 changes: 10 additions & 8 deletions taichi/rhi/cpu/cpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,20 @@ class CpuCommandList : public CommandList {
~CpuCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) override{
int set_index = 0) noexcept override{
TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
19 changes: 11 additions & 8 deletions taichi/rhi/cuda/cuda_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,20 @@ class CudaCommandList : public CommandList {
~CudaCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final{TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) final{
int set_index = 0) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept override{
TI_NOT_IMPLEMENTED};
void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
71 changes: 62 additions & 9 deletions taichi/rhi/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ enum class RhiResult {
out_of_memory = -4,
};

constexpr size_t kBufferSizeEntireSize = size_t(-1);
constexpr size_t kBufferSizeEntireSize = std::numeric_limits<size_t>::max();

#define MAKE_ENUM_FLAGS(name) \
inline name operator|(name a, name b) { \
Expand Down Expand Up @@ -278,7 +278,7 @@ class TI_DLL_EXPORT CommandList {
* Doing so resets all bound resources.
* @params[in] pipeline The pipeline to be bound
*/
virtual void bind_pipeline(Pipeline *p) = 0;
virtual void bind_pipeline(Pipeline *p) noexcept = 0;

/**
* Bind a ShaderResourceSet to a set index.
Expand All @@ -296,7 +296,7 @@ class TI_DLL_EXPORT CommandList {
* `error` If binding failed due to other reasons
*/
virtual RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) = 0;
int set_index = 0) noexcept = 0;

/**
* Bind RasterResources to the command list.
Expand All @@ -308,13 +308,66 @@ class TI_DLL_EXPORT CommandList {
* `not_supported` If some bindings are not supported by the backend
* `error` If binding failed due to other reasons
*/
virtual RhiResult bind_raster_resources(RasterResources *res) = 0;
virtual RhiResult bind_raster_resources(RasterResources *res) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects a contiguous region of memory.
* Changes to memory before the barrier will be visible to accesses after the
* barrier (API command ordering). i.e. Commands recorded after this barrier
* will see the changes made by commands before this barrier.
* This barrier is limited in scope to the Stream that the command list is
* submitted to. Other Streams or Devices may not observe this barrier.
* @params[in] ptr The pointer to the start of the region
* @params[in] size The size of the memory region.
* Size is clamped to the underlying buffer size.
*/
virtual void buffer_barrier(DevicePtr ptr, size_t size) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects an entire buffer.
* Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`
* @params[in] alloc The memory allocation of this barrier
*/
virtual void buffer_barrier(DeviceAllocation alloc) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects all global memory.
* Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`, except that
* it is not limited to a single buffer region. Takes no parameters.
*/
virtual void memory_barrier() noexcept = 0;

/**
* Insert a buffer copy operation into the command list.
* @params[in] src The source Device Pointer
* @params[in] dst The destination Device Pointer
* @params[in] size The size of the region to be copied.
* The size will be clamped to the minimum between
* `dst.size - dst.offset` and `src.size - src.offset`
*/
virtual void buffer_copy(DevicePtr dst,
DevicePtr src,
size_t size) noexcept = 0;

/**
* Insert a memory region fill operation into the command list
* The memory region will be filled with the given (bit precise) value.
* - (Encouraged behavior) If the `data` is 0, the underlying API might
* provide a faster code path.
* - (Encouraged behavior) If the `size` is -1 (max of size_t) the underlying
* API might provide a faster code path.
* @params[in] ptr The start of the memory region.
* ptr.offset will be aligned down to a multiple of 4 bytes.
* @params[in] size The size of the region.
* The size will be clamped to the underlying buffer's size.
*/
virtual void buffer_fill(DevicePtr ptr,
size_t size,
uint32_t data) noexcept = 0;

virtual void buffer_barrier(DevicePtr ptr, size_t size) = 0;
virtual void buffer_barrier(DeviceAllocation alloc) = 0;
virtual void memory_barrier() = 0;
virtual void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) = 0;
virtual void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) = 0;
virtual void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) = 0;

struct ComputeSize {
Expand Down
68 changes: 57 additions & 11 deletions taichi/rhi/dx/dx_device.cpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#ifdef TI_WITH_DX11

#include "taichi/rhi/dx/dx_device.h"
#include "taichi/rhi/impl_support.h"

#include "spirv_hlsl.hpp"
#include <d3dcompiler.h>

namespace taichi::lang {
namespace directx11 {

using namespace rhi_impl;

#ifdef TAICHI_DX11_DEBUG_WINDOW
IDXGISwapChain *g_swapchain = nullptr;
#endif
Expand Down Expand Up @@ -89,13 +92,13 @@ Dx11CommandList::~Dx11CommandList() {
d3d11_deferred_context_->Release();
}

void Dx11CommandList::bind_pipeline(Pipeline *p) {
void Dx11CommandList::bind_pipeline(Pipeline *p) noexcept {
Dx11Pipeline *pipeline = static_cast<Dx11Pipeline *>(p);
d3d11_deferred_context_->CSSetShader(pipeline->get_program(), nullptr, 0);
}

RhiResult Dx11CommandList::bind_shader_resources(ShaderResourceSet *res,
int set_index) {
int set_index) noexcept {
Dx11ResourceSet *set = static_cast<Dx11ResourceSet *>(res);
if (set_index > 0) {
// TODO: Add remapping?
Expand Down Expand Up @@ -123,26 +126,49 @@ RhiResult Dx11CommandList::bind_shader_resources(ShaderResourceSet *res,
return RhiResult::success;
}

RhiResult Dx11CommandList::bind_raster_resources(RasterResources *res) {
RhiResult Dx11CommandList::bind_raster_resources(
RasterResources *res) noexcept {
TI_NOT_IMPLEMENTED;
}

void Dx11CommandList::buffer_barrier(DevicePtr ptr, size_t size) {
TI_NOT_IMPLEMENTED;
void Dx11CommandList::buffer_barrier(DevicePtr ptr, size_t size) noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::buffer_barrier(DeviceAllocation alloc) {
TI_NOT_IMPLEMENTED;
void Dx11CommandList::buffer_barrier(DeviceAllocation alloc) noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::memory_barrier() {
void Dx11CommandList::memory_barrier() noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::buffer_copy(DevicePtr dst, DevicePtr src, size_t size) {
void Dx11CommandList::buffer_copy(DevicePtr dst,
DevicePtr src,
size_t size) noexcept {
ID3D11Buffer *src_buf = device_->alloc_id_to_default_copy(src.alloc_id);
ID3D11Buffer *dst_buf = device_->alloc_id_to_default_copy(dst.alloc_id);

D3D11_BUFFER_DESC src_desc;
D3D11_BUFFER_DESC dst_desc;
src_buf->GetDesc(&src_desc);
dst_buf->GetDesc(&dst_desc);

// Clamp to minimum available size
if (saturate_uadd(src.offset, size) > size_t(src_desc.ByteWidth)) {
size = saturate_usub(size_t(src_desc.ByteWidth), src.offset);
}
if (saturate_uadd(dst.offset, size) > size_t(dst_desc.ByteWidth)) {
size = saturate_usub(size_t(dst_desc.ByteWidth), dst.offset);
}

if (size == 0) {
return;
}

D3D11_BOX box{};
box.left = src.offset;
box.right = size;
Expand All @@ -155,13 +181,33 @@ void Dx11CommandList::buffer_copy(DevicePtr dst, DevicePtr src, size_t size) {
src_buf, 0, &box);
}

void Dx11CommandList::buffer_fill(DevicePtr ptr, size_t size, uint32_t data) {
void Dx11CommandList::buffer_fill(DevicePtr ptr,
size_t size,
uint32_t data) noexcept {
ID3D11UnorderedAccessView *uav =
device_->alloc_id_to_uav(d3d11_deferred_context_, ptr.alloc_id);
D3D11_BUFFER_DESC desc;
device_->alloc_id_to_default_copy(ptr.alloc_id)->GetDesc(&desc);

// Align to 4 bytes
ptr.offset = ptr.offset & size_t(-4);

// Check for overflow
if (ptr.offset > desc.ByteWidth) {
return;
}

TI_ASSERT_INFO(ptr.offset == 0, "DX11 only support full resource clear");
if (saturate_uadd(ptr.offset, size) >= desc.ByteWidth) {
size = kBufferSizeEntireSize;
}

const UINT values[4] = {data, data, data, data};

if (size != kBufferSizeEntireSize) {
// TODO: Add DX11.1 clear regions support
RHI_LOG_ERROR("DX11 Backend does not support subregion clears");
}

d3d11_deferred_context_->ClearUnorderedAccessViewUint(uav, values);

// FIXME: what if the default is not a raw buffer?
Expand Down
16 changes: 8 additions & 8 deletions taichi/rhi/dx/dx_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,15 @@ class Dx11CommandList : public CommandList {
Dx11CommandList(Dx11Device *ti_device);
~Dx11CommandList() override;

void bind_pipeline(Pipeline *p) override;
void bind_pipeline(Pipeline *p) noexcept final;
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final;
RhiResult bind_raster_resources(RasterResources *res) final;
void buffer_barrier(DevicePtr ptr, size_t size) override;
void buffer_barrier(DeviceAllocation alloc) override;
void memory_barrier() override;
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override;
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override;
int set_index = 0) noexcept final;
RhiResult bind_raster_resources(RasterResources *res) noexcept final;
void buffer_barrier(DevicePtr ptr, size_t size) noexcept final;
void buffer_barrier(DeviceAllocation alloc) noexcept final;
void memory_barrier() noexcept final;
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept final;
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept final;
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override;

// These are not implemented in compute only device
Expand Down
20 changes: 20 additions & 0 deletions taichi/rhi/impl_support.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <forward_list>
#include <unordered_set>
#include <mutex>
#include <type_traits>

namespace taichi::lang {

Expand Down Expand Up @@ -40,6 +41,25 @@ void disabled_function([[maybe_unused]] Ts... C) {

#define RHI_ASSERT(cond) assert(cond);

// Saturating unsigned addition.
// Returns `a + b`, or the maximum value representable by `T` when the exact
// sum would not fit in `T`. Only unsigned integer types are accepted.
template <typename T>
constexpr auto saturate_uadd(T a, T b) {
  static_assert(std::is_unsigned<T>::value);
  constexpr T upper_bound = std::numeric_limits<T>::max();
  // `upper_bound - a` is the headroom left above `a`; if `b` exceeds it the
  // sum would wrap around, so clamp to the largest representable value.
  return (b > upper_bound - a) ? upper_bound : static_cast<T>(a + b);
}

// Saturating unsigned subtraction.
// Returns `x - y`, or zero when `y` is larger than `x` (i.e. when the exact
// difference would be negative). Only unsigned integer types are accepted.
template <typename T>
constexpr auto saturate_usub(T x, T y) {
  static_assert(std::is_unsigned<T>::value);
  // Guard against wrap-around instead of masking the wrapped result.
  return (y > x) ? T{0} : static_cast<T>(x - y);
}

// Wrapped return-code & object tuple for simplicity
// Easier to read than std::pair
// NOTE: If an internal function can fail, wrap return object with this!
Expand Down
Loading

0 comments on commit f68aa3a

Please sign in to comment.