Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[rhi] Update compute CommandList APIs (except dispatch) #7037

Merged
merged 4 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions misc/prtags.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"dx12" : "DirectX 12 backend",
"spirv" : "SPIR-V common codegen",
"gfx" : "Common graphics runtime",
"rhi" : "Unified Device API",
"wasm" : "WebAssembly backend",
"misc" : "Miscellaneous",
"std" : "Standard library",
Expand Down
19 changes: 11 additions & 8 deletions taichi/rhi/amdgpu/amdgpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,20 @@ class AmdgpuCommandList : public CommandList {
~AmdgpuCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept final{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final{TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) final{
int set_index = 0) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept final{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept final{
TI_NOT_IMPLEMENTED};
void memory_barrier() noexcept final{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept final{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
18 changes: 10 additions & 8 deletions taichi/rhi/cpu/cpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,20 @@ class CpuCommandList : public CommandList {
~CpuCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) override{
int set_index = 0) noexcept override{
TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
19 changes: 11 additions & 8 deletions taichi/rhi/cuda/cuda_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,20 @@ class CudaCommandList : public CommandList {
~CudaCommandList() override {
}

void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED};
void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED};
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final{TI_NOT_IMPLEMENTED};
RhiResult bind_raster_resources(RasterResources *res) final{
int set_index = 0) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};
void memory_barrier() override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{
RhiResult bind_raster_resources(RasterResources *res) noexcept final{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{
void buffer_barrier(DevicePtr ptr,
size_t size) noexcept override{TI_NOT_IMPLEMENTED};
void buffer_barrier(DeviceAllocation alloc) noexcept override{
TI_NOT_IMPLEMENTED};
void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED};
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{
TI_NOT_IMPLEMENTED};
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{
TI_NOT_IMPLEMENTED};
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{
TI_NOT_IMPLEMENTED};
Expand Down
71 changes: 62 additions & 9 deletions taichi/rhi/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ enum class RhiResult {
out_of_memory = -4,
};

constexpr size_t kBufferSizeEntireSize = size_t(-1);
constexpr size_t kBufferSizeEntireSize = std::numeric_limits<size_t>::max();

#define MAKE_ENUM_FLAGS(name) \
inline name operator|(name a, name b) { \
Expand Down Expand Up @@ -278,7 +278,7 @@ class TI_DLL_EXPORT CommandList {
* Doing so resets all bound resources.
* @params[in] pipeline The pipeline to be bound
*/
virtual void bind_pipeline(Pipeline *p) = 0;
virtual void bind_pipeline(Pipeline *p) noexcept = 0;

/**
* Bind a ShaderResourceSet to a set index.
Expand All @@ -296,7 +296,7 @@ class TI_DLL_EXPORT CommandList {
* `error` If binding failed due to other reasons
*/
virtual RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) = 0;
int set_index = 0) noexcept = 0;

/**
* Bind RasterResources to the command list.
Expand All @@ -308,13 +308,66 @@ class TI_DLL_EXPORT CommandList {
* `not_supported` If some bindings are not supported by the backend
* `error` If binding failed due to other reasons
*/
virtual RhiResult bind_raster_resources(RasterResources *res) = 0;
virtual RhiResult bind_raster_resources(RasterResources *res) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects a contiguous region of memory.
* Changes to memory before the barrier will be visible to accesses after the
* barrier (API command ordering), i.e. commands after this barrier will see
* the changes made by commands before this barrier.
* This barrier is limited in scope to the Stream that the command list is
* submitted to. Other Streams or Devices may not observe this barrier.
* @params[in] ptr The pointer to the start of the region
* @params[in] size The size of the memory region.
* Size is clamped to the underlying buffer size.
*/
virtual void buffer_barrier(DevicePtr ptr, size_t size) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects an entire buffer.
* Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`
* @params[in] alloc The memory allocation of this barrier
*/
virtual void buffer_barrier(DeviceAllocation alloc) noexcept = 0;

/**
* Insert a memory barrier into the command list.
* The barrier affects all global memory.
* Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`, but applied
* to all global memory. This overload takes no parameters.
*/
virtual void memory_barrier() noexcept = 0;

/**
* Insert a buffer copy operation into the command list.
* @params[in] src The source Device Pointer
* @params[in] dst The destination Device Pointer
* @params[in] size The size of the region to be copied.
* The size will be clamped to the minimum between
* `dst.size - dst.offset` and `src.size - src.offset`
*/
virtual void buffer_copy(DevicePtr dst,
DevicePtr src,
size_t size) noexcept = 0;

/**
* Insert a memory region fill operation into the command list
* The memory region will be filled with the given (bit precise) value.
* - (Encouraged behavior) If the `data` is 0, the underlying API might
* provide a faster code path.
* - (Encouraged behavior) If the `size` is -1 (max of size_t) the underlying
* API might provide a faster code path.
* @params[in] ptr The start of the memory region.
* ptr.offset will be aligned down to a multiple of 4 bytes.
* @params[in] size The size of the region.
* The size will be clamped to the underlying buffer's size.
*/
virtual void buffer_fill(DevicePtr ptr,
size_t size,
uint32_t data) noexcept = 0;

virtual void buffer_barrier(DevicePtr ptr, size_t size) = 0;
virtual void buffer_barrier(DeviceAllocation alloc) = 0;
virtual void memory_barrier() = 0;
virtual void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) = 0;
virtual void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) = 0;
virtual void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) = 0;

struct ComputeSize {
Expand Down
68 changes: 57 additions & 11 deletions taichi/rhi/dx/dx_device.cpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#ifdef TI_WITH_DX11

#include "taichi/rhi/dx/dx_device.h"
#include "taichi/rhi/impl_support.h"

#include "spirv_hlsl.hpp"
#include <d3dcompiler.h>

namespace taichi::lang {
namespace directx11 {

using namespace rhi_impl;

#ifdef TAICHI_DX11_DEBUG_WINDOW
IDXGISwapChain *g_swapchain = nullptr;
#endif
Expand Down Expand Up @@ -89,13 +92,13 @@ Dx11CommandList::~Dx11CommandList() {
d3d11_deferred_context_->Release();
}

void Dx11CommandList::bind_pipeline(Pipeline *p) {
void Dx11CommandList::bind_pipeline(Pipeline *p) noexcept {
Dx11Pipeline *pipeline = static_cast<Dx11Pipeline *>(p);
d3d11_deferred_context_->CSSetShader(pipeline->get_program(), nullptr, 0);
}

RhiResult Dx11CommandList::bind_shader_resources(ShaderResourceSet *res,
int set_index) {
int set_index) noexcept {
Dx11ResourceSet *set = static_cast<Dx11ResourceSet *>(res);
if (set_index > 0) {
// TODO: Add remapping?
Expand Down Expand Up @@ -123,26 +126,49 @@ RhiResult Dx11CommandList::bind_shader_resources(ShaderResourceSet *res,
return RhiResult::success;
}

RhiResult Dx11CommandList::bind_raster_resources(RasterResources *res) {
RhiResult Dx11CommandList::bind_raster_resources(
RasterResources *res) noexcept {
TI_NOT_IMPLEMENTED;
}

void Dx11CommandList::buffer_barrier(DevicePtr ptr, size_t size) {
TI_NOT_IMPLEMENTED;
void Dx11CommandList::buffer_barrier(DevicePtr ptr, size_t size) noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::buffer_barrier(DeviceAllocation alloc) {
TI_NOT_IMPLEMENTED;
void Dx11CommandList::buffer_barrier(DeviceAllocation alloc) noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::memory_barrier() {
void Dx11CommandList::memory_barrier() noexcept {
// No-op
// Not needed for DX11
}

void Dx11CommandList::buffer_copy(DevicePtr dst, DevicePtr src, size_t size) {
void Dx11CommandList::buffer_copy(DevicePtr dst,
DevicePtr src,
size_t size) noexcept {
ID3D11Buffer *src_buf = device_->alloc_id_to_default_copy(src.alloc_id);
ID3D11Buffer *dst_buf = device_->alloc_id_to_default_copy(dst.alloc_id);

D3D11_BUFFER_DESC src_desc;
D3D11_BUFFER_DESC dst_desc;
src_buf->GetDesc(&src_desc);
dst_buf->GetDesc(&dst_desc);

// Clamp to minimum available size
if (saturate_uadd(src.offset, size) > size_t(src_desc.ByteWidth)) {
size = saturate_usub(size_t(src_desc.ByteWidth), src.offset);
}
if (saturate_uadd(dst.offset, size) > size_t(dst_desc.ByteWidth)) {
size = saturate_usub(size_t(dst_desc.ByteWidth), dst.offset);
}

if (size == 0) {
return;
}

D3D11_BOX box{};
box.left = src.offset;
box.right = size;
Expand All @@ -155,13 +181,33 @@ void Dx11CommandList::buffer_copy(DevicePtr dst, DevicePtr src, size_t size) {
src_buf, 0, &box);
}

void Dx11CommandList::buffer_fill(DevicePtr ptr, size_t size, uint32_t data) {
void Dx11CommandList::buffer_fill(DevicePtr ptr,
size_t size,
uint32_t data) noexcept {
ID3D11UnorderedAccessView *uav =
device_->alloc_id_to_uav(d3d11_deferred_context_, ptr.alloc_id);
D3D11_BUFFER_DESC desc;
device_->alloc_id_to_default_copy(ptr.alloc_id)->GetDesc(&desc);

// Align to 4 bytes
ptr.offset = ptr.offset & size_t(-4);

// Check for overflow
if (ptr.offset > desc.ByteWidth) {
return;
}

TI_ASSERT_INFO(ptr.offset == 0, "DX11 only support full resource clear");
if (saturate_uadd(ptr.offset, size) >= desc.ByteWidth) {
size = kBufferSizeEntireSize;
}

const UINT values[4] = {data, data, data, data};

if (size != kBufferSizeEntireSize) {
// TODO: Add DX11.1 clear regions support
RHI_LOG_ERROR("DX11 Backend does not support subregion clears");
}

d3d11_deferred_context_->ClearUnorderedAccessViewUint(uav, values);

// FIXME: what if the default is not a raw buffer?
Expand Down
16 changes: 8 additions & 8 deletions taichi/rhi/dx/dx_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,15 @@ class Dx11CommandList : public CommandList {
Dx11CommandList(Dx11Device *ti_device);
~Dx11CommandList() override;

void bind_pipeline(Pipeline *p) override;
void bind_pipeline(Pipeline *p) noexcept final;
RhiResult bind_shader_resources(ShaderResourceSet *res,
int set_index = 0) final;
RhiResult bind_raster_resources(RasterResources *res) final;
void buffer_barrier(DevicePtr ptr, size_t size) override;
void buffer_barrier(DeviceAllocation alloc) override;
void memory_barrier() override;
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override;
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override;
int set_index = 0) noexcept final;
RhiResult bind_raster_resources(RasterResources *res) noexcept final;
void buffer_barrier(DevicePtr ptr, size_t size) noexcept final;
void buffer_barrier(DeviceAllocation alloc) noexcept final;
void memory_barrier() noexcept final;
void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept final;
void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept final;
void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override;

// These are not implemented in compute only device
Expand Down
20 changes: 20 additions & 0 deletions taichi/rhi/impl_support.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <forward_list>
#include <unordered_set>
#include <mutex>
#include <type_traits>

namespace taichi::lang {

Expand Down Expand Up @@ -40,6 +41,25 @@ void disabled_function([[maybe_unused]] Ts... C) {

#define RHI_ASSERT(cond) assert(cond);

/**
 * Saturating addition for unsigned integers.
 * Returns `a + b`, or the maximum value representable by `T` when the sum
 * would not fit in `T`.
 * @tparam T An unsigned integer type (enforced at compile time).
 */
template <typename T>
constexpr auto saturate_uadd(T a, T b) {
  static_assert(std::is_unsigned<T>::value);
  constexpr T kMax = std::numeric_limits<T>::max();
  // `kMax - a` is how much can still be added to `a` without wrapping.
  if (b > static_cast<T>(kMax - a)) {
    return kMax;
  }
  return static_cast<T>(a + b);
}

/**
 * Saturating subtraction for unsigned integers.
 * Returns `x - y`, or 0 when `y` is greater than `x` (instead of wrapping
 * around).
 * @tparam T An unsigned integer type (enforced at compile time).
 */
template <typename T>
constexpr auto saturate_usub(T x, T y) {
  static_assert(std::is_unsigned<T>::value);
  // Clamp at zero whenever the subtraction would underflow.
  if (y >= x) {
    return static_cast<T>(0);
  }
  return static_cast<T>(x - y);
}

// Wrapped return-code & object tuple for simplicity
// Easier to read than std::pair
// NOTE: If an internal function can fail, wrap return object with this!
Expand Down
Loading