From 15dfbd32a83d5dd22da990bc6a18487e24a17caf Mon Sep 17 00:00:00 2001 From: Vecvec Date: Sat, 16 Nov 2024 09:06:59 +1300 Subject: [PATCH 1/3] move alignment into HAL --- wgpu-core/src/command/ray_tracing.rs | 25 +++++++------- wgpu-core/src/device/ray_tracing.rs | 4 +-- wgpu-core/src/ray_tracing.rs | 49 ++-------------------------- wgpu-hal/src/dx12/adapter.rs | 2 ++ wgpu-hal/src/dx12/device.rs | 5 +++ wgpu-hal/src/dynamic/device.rs | 7 +++- wgpu-hal/src/empty.rs | 5 +++ wgpu-hal/src/gles/adapter.rs | 2 ++ wgpu-hal/src/gles/device.rs | 6 +++- wgpu-hal/src/lib.rs | 15 +++++++++ wgpu-hal/src/metal/adapter.rs | 2 ++ wgpu-hal/src/metal/device.rs | 5 +++ wgpu-hal/src/vulkan/adapter.rs | 7 ++++ wgpu-hal/src/vulkan/device.rs | 22 +++++++++++-- wgpu-hal/src/vulkan/mod.rs | 9 +++++ 15 files changed, 101 insertions(+), 64 deletions(-) diff --git a/wgpu-core/src/command/ray_tracing.rs b/wgpu-core/src/command/ray_tracing.rs index efc3ebbe2d..ec90a5d28e 100644 --- a/wgpu-core/src/command/ray_tracing.rs +++ b/wgpu-core/src/command/ray_tracing.rs @@ -5,7 +5,7 @@ use crate::{ id::CommandEncoderId, init_tracker::MemoryInitKind, ray_tracing::{ - tlas_instance_into_bytes, BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry, + BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry, BuildAccelerationStructureError, TlasAction, TlasBuildEntry, TlasInstance, TlasPackage, TraceBlasBuildEntry, TraceBlasGeometries, TraceBlasTriangleGeometry, TraceTlasInstance, TraceTlasPackage, ValidateBlasActionsError, ValidateTlasActionsError, @@ -60,9 +60,6 @@ struct TlasBufferStore { entry: TlasBuildEntry, } -// TODO: Get this from the device (e.g. VkPhysicalDeviceAccelerationStructurePropertiesKHR.minAccelerationStructureScratchOffsetAlignment) this is currently the largest possible some devices have 0, 64, 128 (lower limits) so this could create excess allocation (Note: dx12 has 256). -const SCRATCH_BUFFER_ALIGNMENT: u32 = 256; - impl Global { // Currently this function is very similar to its safe counterpart, however certain parts of it are very different, // making for the two to be implemented differently, the main difference is this function has separate buffers for each @@ -193,6 +190,7 @@ impl Global { &mut scratch_buffer_blas_size, &mut blas_storage, hub, + device.alignments.ray_tracing_scratch_buffer_alignment, )?; let mut scratch_buffer_tlas_size = 0; @@ -260,7 +258,7 @@ impl Global { let scratch_buffer_offset = scratch_buffer_tlas_size; scratch_buffer_tlas_size += align_to( tlas.size_info.build_scratch_size as u32, - SCRATCH_BUFFER_ALIGNMENT, + device.alignments.ray_tracing_scratch_buffer_alignment, ) as u64; tlas_storage.push(UnsafeTlasStore { @@ -508,6 +506,7 @@ impl Global { &mut scratch_buffer_blas_size, &mut blas_storage, hub, + device.alignments.ray_tracing_scratch_buffer_alignment, )?; let mut tlas_lock_store = Vec::<(Option, Arc)>::new(); @@ -535,7 +534,7 @@ impl Global { let scratch_buffer_offset = scratch_buffer_tlas_size; scratch_buffer_tlas_size += align_to( tlas.size_info.build_scratch_size as u32, - SCRATCH_BUFFER_ALIGNMENT, + device.alignments.ray_tracing_scratch_buffer_alignment, ) as u64; let first_byte_index = instance_buffer_staging_source.len(); @@ -558,10 +557,13 @@ impl Global { cmd_buf_data.trackers.blas_s.set_single(blas.clone()); - instance_buffer_staging_source.extend(tlas_instance_into_bytes( - &instance, - blas.handle, - device.backend(), + instance_buffer_staging_source.extend(device.raw().tlas_instance_to_bytes( + hal::TlasInstance { + transform: *instance.transform, + custom_index: instance.custom_index, + mask: instance.mask, + blas_address: blas.handle, + }, )); instance_count += 1; @@ -1013,6 +1015,7 @@ fn iter_buffers<'a, 'b>( scratch_buffer_blas_size: &mut u64, blas_storage: &mut Vec>, hub: &Hub, + ray_tracing_scratch_buffer_alignment: u32, ) -> Result<(), BuildAccelerationStructureError> { let mut triangle_entries = Vec::>::new(); @@ -1192,7 +1195,7 @@ fn iter_buffers<'a, 'b>( let scratch_buffer_offset = *scratch_buffer_blas_size; *scratch_buffer_blas_size += align_to( blas.size_info.build_scratch_size as u32, - SCRATCH_BUFFER_ALIGNMENT, + ray_tracing_scratch_buffer_alignment, ) as u64; blas_storage.push(BlasStore { diff --git a/wgpu-core/src/device/ray_tracing.rs b/wgpu-core/src/device/ray_tracing.rs index baab420919..12afc7e6a8 100644 --- a/wgpu-core/src/device/ray_tracing.rs +++ b/wgpu-core/src/device/ray_tracing.rs @@ -12,7 +12,7 @@ use crate::{ global::Global, id::{self, BlasId, TlasId}, lock::RwLock, - ray_tracing::{get_raw_tlas_instance_size, CreateBlasError, CreateTlasError}, + ray_tracing::{CreateBlasError, CreateTlasError}, resource, LabelHelpers, }; use hal::AccelerationStructureTriangleIndices; @@ -135,7 +135,7 @@ impl Device { .map_err(DeviceError::from_hal)?; let instance_buffer_size = - get_raw_tlas_instance_size(self.backend()) * desc.max_instances.max(1) as usize; + self.alignments.raw_tlas_instance_size * desc.max_instances.max(1) as usize; let instance_buffer = unsafe { self.raw().create_buffer(&hal::BufferDescriptor { label: Some("(wgpu-core) instances_buffer"), diff --git a/wgpu-core/src/ray_tracing.rs b/wgpu-core/src/ray_tracing.rs index a094b35308..9f4a11946d 100644 --- a/wgpu-core/src/ray_tracing.rs +++ b/wgpu-core/src/ray_tracing.rs @@ -13,8 +13,8 @@ use crate::{ id::{BlasId, BufferId, TlasId}, resource::CreateBufferError, }; -use std::{mem::size_of, sync::Arc}; -use std::{num::NonZeroU64, slice}; +use std::num::NonZeroU64; +use std::sync::Arc; use crate::resource::{Blas, ResourceErrorIdent, Tlas}; use thiserror::Error; @@ -276,48 +276,3 @@ pub struct TraceTlasPackage { pub instances: Vec>, pub lowest_unmodified: u32, } - -pub(crate) fn get_raw_tlas_instance_size(backend: wgt::Backend) -> usize { - // TODO: this should be provided by the backend - match backend { - wgt::Backend::Empty => 0, - wgt::Backend::Vulkan => 64, - _ => unimplemented!(), - } -} - -#[derive(Clone)] -#[repr(C)] -struct RawTlasInstance { - transform: [f32; 12], - custom_index_and_mask: u32, - shader_binding_table_record_offset_and_flags: u32, - acceleration_structure_reference: u64, -} - -pub(crate) fn tlas_instance_into_bytes( - instance: &TlasInstance, - blas_address: u64, - backend: wgt::Backend, -) -> Vec { - // TODO: get the device to do this - match backend { - wgt::Backend::Empty => vec![], - wgt::Backend::Vulkan => { - const MAX_U24: u32 = (1u32 << 24u32) - 1u32; - let temp = RawTlasInstance { - transform: *instance.transform, - custom_index_and_mask: (instance.custom_index & MAX_U24) - | (u32::from(instance.mask) << 24), - shader_binding_table_record_offset_and_flags: 0, - acceleration_structure_reference: blas_address, - }; - let temp: *const _ = &temp; - unsafe { - slice::from_raw_parts::(temp.cast::(), size_of::()) - .to_vec() - } - } - _ => unimplemented!(), - } -} diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs index 45d69f5584..b6bedce5cc 100644 --- a/wgpu-hal/src/dx12/adapter.rs +++ b/wgpu-hal/src/dx12/adapter.rs @@ -522,6 +522,8 @@ impl super::Adapter { // Direct3D correctly bounds-checks all array accesses: // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#18.6.8.2%20Device%20Memory%20Reads uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(), + raw_tlas_instance_size: 0, + ray_tracing_scratch_buffer_alignment: 0, }, downlevel, }, diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs index b2b9629ed4..9e089bbd12 100644 --- a/wgpu-hal/src/dx12/device.rs +++ b/wgpu-hal/src/dx12/device.rs @@ -21,6 +21,7 @@ use super::{conv, descriptor, D3D12Lib}; use crate::{ auxil::{self, dxgi::result::HResult}, dx12::{borrow_optional_interface_temporarily, shader_compilation, Event}, + TlasInstance, }; // this has to match Naga's HLSL backend, and also needs to be null-terminated @@ -1939,4 +1940,8 @@ impl crate::Device for super::Device { total_reserved_bytes: upstream.total_reserved_bytes, }) } + + fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec { + todo!() + } } diff --git a/wgpu-hal/src/dynamic/device.rs b/wgpu-hal/src/dynamic/device.rs index f38d9fd328..28ef35287c 100644 --- a/wgpu-hal/src/dynamic/device.rs +++ b/wgpu-hal/src/dynamic/device.rs @@ -5,7 +5,7 @@ use crate::{ GetAccelerationStructureBuildSizesDescriptor, Label, MemoryRange, PipelineCacheDescriptor, PipelineCacheError, PipelineError, PipelineLayoutDescriptor, RenderPipelineDescriptor, SamplerDescriptor, ShaderError, ShaderInput, ShaderModuleDescriptor, TextureDescriptor, - TextureViewDescriptor, + TextureViewDescriptor, TlasInstance, }; use super::{ @@ -158,6 +158,7 @@ pub trait DynDevice: DynResource { &self, acceleration_structure: Box, ); + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec; fn get_internal_counters(&self) -> wgt::HalCounters; fn generate_allocator_report(&self) -> Option; @@ -520,6 +521,10 @@ impl DynDevice for D { unsafe { D::destroy_acceleration_structure(self, acceleration_structure.unbox()) } } + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec { + D::tlas_instance_to_bytes(self, instance) + } + fn get_internal_counters(&self) -> wgt::HalCounters { D::get_internal_counters(self) } diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs index e74c91c07e..d4fc7c3127 100644 --- a/wgpu-hal/src/empty.rs +++ b/wgpu-hal/src/empty.rs @@ -1,5 +1,6 @@ #![allow(unused_variables)] +use crate::TlasInstance; use std::ops::Range; #[derive(Clone, Debug)] @@ -306,6 +307,10 @@ impl crate::Device for Context { } unsafe fn destroy_acceleration_structure(&self, _acceleration_structure: Resource) {} + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec { + vec![] + } + fn get_internal_counters(&self) -> wgt::HalCounters { Default::default() } diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs index a654215d21..032ad1b46b 100644 --- a/wgpu-hal/src/gles/adapter.rs +++ b/wgpu-hal/src/gles/adapter.rs @@ -851,6 +851,8 @@ impl super::Adapter { // being, provide 1 as the value here, to cause as little // trouble as possible. uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(), + raw_tlas_instance_size: 0, + ray_tracing_scratch_buffer_alignment: 0, }, }, }) diff --git a/wgpu-hal/src/gles/device.rs b/wgpu-hal/src/gles/device.rs index 365ac7d72d..8d0f2e70d2 100644 --- a/wgpu-hal/src/gles/device.rs +++ b/wgpu-hal/src/gles/device.rs @@ -8,7 +8,7 @@ use std::{ sync::{Arc, Mutex}, }; -use crate::AtomicFenceValue; +use crate::{AtomicFenceValue, TlasInstance}; use arrayvec::ArrayVec; use std::sync::atomic::Ordering; @@ -1633,6 +1633,10 @@ impl crate::Device for super::Device { ) { } + fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec { + unimplemented!() + } + fn get_internal_counters(&self) -> wgt::HalCounters { self.counters.clone() } diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index ad4b458cd5..15fa9a2874 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -971,6 +971,7 @@ pub trait Device: WasmNotSendSync { &self, acceleration_structure: ::AccelerationStructure, ); + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec; fn get_internal_counters(&self) -> wgt::HalCounters; @@ -1771,6 +1772,12 @@ pub struct Alignments { /// [`Uniform`]: wgt::BufferBindingType::Uniform /// [size]: BufferBinding::size pub uniform_bounds_check_alignment: wgt::BufferSize, + + /// The size of the raw TLAS instance + pub raw_tlas_instance_size: usize, + + /// What the scratch buffer for building an acceleration structure must be aligned to + pub ray_tracing_scratch_buffer_alignment: u32, } #[derive(Clone, Debug)] @@ -2519,3 +2526,11 @@ bitflags::bitflags! { pub struct AccelerationStructureBarrier { pub usage: Range, } + +#[derive(Debug, Copy, Clone)] +pub struct TlasInstance { + pub transform: [f32; 12], + pub custom_index: u32, + pub mask: u8, + pub blas_address: u64, +} diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index e7db97a1f9..fa7622a372 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -1001,6 +1001,8 @@ impl super::PrivateCapabilities { // Metal Shading Language it generates, so from `wgpu_hal`'s // users' point of view, references are tightly checked. uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(), + raw_tlas_instance_size: 0, + ray_tracing_scratch_buffer_alignment: 0, }, downlevel, } diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 81887aac7c..394de2bf9f 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -8,6 +8,7 @@ use std::{ use super::conv; use crate::auxil::map_naga_stage; +use crate::TlasInstance; type DeviceResult = Result; @@ -1426,6 +1427,10 @@ impl crate::Device for super::Device { unimplemented!() } + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec { + unimplemented!() + } + fn get_internal_counters(&self) -> wgt::HalCounters { self.counters.clone() } diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 2916928e46..6e88e03b8c 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -1140,6 +1140,13 @@ impl PhysicalDeviceProperties { }; wgt::BufferSize::new(alignment).unwrap() }, + raw_tlas_instance_size: 64, + ray_tracing_scratch_buffer_alignment: self.acceleration_structure.map_or( + 0, + |acceleration_structure| { + acceleration_structure.min_acceleration_structure_scratch_offset_alignment + }, + ), } } } diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs index fe72ec40f3..4071db37e5 100644 --- a/wgpu-hal/src/vulkan/device.rs +++ b/wgpu-hal/src/vulkan/device.rs @@ -1,16 +1,18 @@ -use super::conv; +use super::{conv, RawTlasInstance}; use arrayvec::ArrayVec; use ash::{khr, vk}; use parking_lot::Mutex; +use crate::TlasInstance; use std::{ borrow::Cow, collections::{hash_map::Entry, BTreeMap}, ffi::{CStr, CString}, + mem, mem::MaybeUninit, num::NonZeroU32, - ptr, + ptr, slice, sync::Arc, }; @@ -2557,6 +2559,22 @@ impl crate::Device for super::Device { self.counters.clone() } + + fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec { + const MAX_U24: u32 = (1u32 << 24u32) - 1u32; + let temp = RawTlasInstance { + transform: instance.transform, + custom_index_and_mask: (instance.custom_index & MAX_U24) + | (u32::from(instance.mask) << 24), + shader_binding_table_record_offset_and_flags: 0, + acceleration_structure_reference: instance.blas_address, + }; + let temp: *const _ = &temp; + unsafe { + slice::from_raw_parts::(temp.cast::(), mem::size_of::()) + .to_vec() + } + } } impl super::DeviceShared { diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index ff3c865fc7..c4c5f70d7d 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -1423,3 +1423,12 @@ fn get_lost_err() -> crate::DeviceError { #[allow(unreachable_code)] crate::DeviceError::Lost } + +#[derive(Clone)] +#[repr(C)] +struct RawTlasInstance { + transform: [f32; 12], + custom_index_and_mask: u32, + shader_binding_table_record_offset_and_flags: u32, + acceleration_structure_reference: u64, +} From 787e8d5c47e80876704ae0e3a379ae6925bc93a9 Mon Sep 17 00:00:00 2001 From: Vecvec Date: Sat, 16 Nov 2024 09:32:41 +1300 Subject: [PATCH 2/3] MacOS CI --- wgpu-hal/src/metal/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 394de2bf9f..745ad598a7 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -1427,7 +1427,7 @@ impl crate::Device for super::Device { unimplemented!() } - fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec { + fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec { unimplemented!() } From 51fe5764da39c5a38abc03418a7b3099d25f610f Mon Sep 17 00:00:00 2001 From: Vecvec Date: Sun, 17 Nov 2024 15:33:46 +1300 Subject: [PATCH 3/3] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5149ae6c40..79dbdca2fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,7 @@ By @ErichDonGubler in [#6456](https://github.com/gfx-rs/wgpu/pull/6456), [#6148] #### General - Return submission index in `map_async` and `on_submitted_work_done` to track down completion of async callbacks. By @eliemichel in [#6360](https://github.com/gfx-rs/wgpu/pull/6360). +- Move raytracing alignments into HAL instead of in core. By @Vecvec in [#6563](https://github.com/gfx-rs/wgpu/pull/6563). ### Changes