fix: handle Queue::submit non-fatally (#6318)

* Change the signature of `wgpu_core::Global::queue_submit` to return a `(SubmissionIndex, …)` in addition to its current error type. * Change the control flow of errors in `Queue::submit` to break to the end of a block. This is similar to what we already do in many APIs in `wgpu_core`. * Hoist the scope of the local `submit_index` binding so it can be used at the point where we need to convert current error paths to also return the submission index. Later, we will likely want to avoid actually retrieving a new submission index so we can minimize the critical section of code. We'll need to figure out a strategy for returning a valid (but not necessarily unique) index in the case of failures that prevent successful submission.
gfx-rs · Sep 25, 2024 · fc2fd95 · fc2fd95
1 parent 859dd88
commit fc2fd95
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 24 deletions.
diff --git a/deno_webgpu/queue.rs b/deno_webgpu/queue.rs
@@ -44,7 +44,7 @@ pub fn op_webgpu_queue_submit(
         })
         .collect::<Result<Vec<_>, AnyError>>()?;
 
-    let maybe_err = instance.queue_submit(queue, &ids).err();
+    let maybe_err = instance.queue_submit(queue, &ids).err().map(|(_idx, e)| e);
 
     for rid in command_buffers {
         let resource = state.resource_table.take::<WebGpuCommandBuffer>(rid)?;

diff --git a/tests/tests/regression/issue_6317.rs b/tests/tests/regression/issue_6317.rs
@@ -0,0 +1,58 @@
+use wgpu::{DownlevelFlags, Limits};
+use wgpu_macros::gpu_test;
+use wgpu_test::{fail, GpuTestConfiguration, TestParameters};
+
+#[gpu_test]
+static NON_FATAL_ERRORS_IN_QUEUE_SUBMIT: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS)
+            .limits(Limits::downlevel_defaults()),
+    )
+    .run_sync(|ctx| {
+        let shader_with_trivial_bind_group = concat!(
+            "@group(0) @binding(0) var<storage, read_write> stuff: u32;\n",
+            "\n",
+            "@compute @workgroup_size(1) fn main() { stuff = 2u; }\n"
+        );
+
+        let module = ctx
+            .device
+            .create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: None,
+                source: wgpu::ShaderSource::Wgsl(shader_with_trivial_bind_group.into()),
+            });
+
+        let compute_pipeline =
+            ctx.device
+                .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                    label: None,
+                    layout: None,
+                    module: &module,
+                    entry_point: None,
+                    compilation_options: Default::default(),
+                    cache: Default::default(),
+                });
+
+        fail(
+            &ctx.device,
+            || {
+                let mut command_encoder = ctx.device.create_command_encoder(&Default::default());
+                {
+                    let mut render_pass = command_encoder.begin_compute_pass(&Default::default());
+                    render_pass.set_pipeline(&compute_pipeline);
+
+                    // NOTE: We deliberately don't set a bind group here, to provoke a validation
+                    // error.
+
+                    render_pass.dispatch_workgroups(1, 1, 1);
+                }
+
+                let _idx = ctx.queue.submit([command_encoder.finish()]);
+            },
+            Some(concat!(
+                "The current set ComputePipeline with '' label ",
+                "expects a BindGroup to be set at index 0"
+            )),
+        )
+    });
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
@@ -6,6 +6,7 @@ mod regression {
     mod issue_4485;
     mod issue_4514;
     mod issue_5553;
+    mod issue_6317;
 }
 
 mod bgra8unorm_storage;

diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
@@ -1027,11 +1027,13 @@ impl Global {
         &self,
         queue_id: QueueId,
         command_buffer_ids: &[id::CommandBufferId],
-    ) -> Result<SubmissionIndex, QueueSubmitError> {
+    ) -> Result<SubmissionIndex, (SubmissionIndex, QueueSubmitError)> {
         profiling::scope!("Queue::submit");
         api_log!("Queue::submit {queue_id:?}");
 
-        let (submit_index, callbacks) = {
+        let submit_index;
+
+        let res = 'error: {
             let hub = &self.hub;
 
             let queue = hub.queues.get(queue_id);
@@ -1042,7 +1044,7 @@ impl Global {
 
             // Fence lock must be acquired after the snatch lock everywhere to avoid deadlocks.
             let mut fence = device.fence.write();
-            let submit_index = device
+            submit_index = device
                 .active_submission_index
                 .fetch_add(1, Ordering::SeqCst)
                 + 1;
@@ -1119,18 +1121,29 @@ impl Global {
                         }
 
                         // execute resource transitions
-                        unsafe {
+                        if let Err(e) = unsafe {
                             baked.encoder.begin_encoding(hal_label(
                                 Some("(wgpu internal) Transit"),
                                 device.instance_flags,
                             ))
                         }
-                        .map_err(|e| device.handle_hal_error(e))?;
+                        .map_err(|e| device.handle_hal_error(e))
+                        {
+                            break 'error Err(e.into());
+                        }
 
                         //Note: locking the trackers has to be done after the storages
                         let mut trackers = device.trackers.lock();
-                        baked.initialize_buffer_memory(&mut trackers, &snatch_guard)?;
-                        baked.initialize_texture_memory(&mut trackers, device, &snatch_guard)?;
+                        if let Err(e) = baked.initialize_buffer_memory(&mut trackers, &snatch_guard)
+                        {
+                            break 'error Err(e.into());
+                        }
+                        if let Err(e) =
+                            baked.initialize_texture_memory(&mut trackers, device, &snatch_guard)
+                        {
+                            break 'error Err(e.into());
+                        }
+
                         //Note: stateless trackers are not merged:
                         // device already knows these resources exist.
                         CommandBuffer::insert_barriers_from_device_tracker(
@@ -1147,13 +1160,16 @@ impl Global {
                         // Note: we could technically do it after all of the command buffers,
                         // but here we have a command encoder by hand, so it's easier to use it.
                         if !used_surface_textures.is_empty() {
-                            unsafe {
+                            if let Err(e) = unsafe {
                                 baked.encoder.begin_encoding(hal_label(
                                     Some("(wgpu internal) Present"),
                                     device.instance_flags,
                                 ))
                             }
-                            .map_err(|e| device.handle_hal_error(e))?;
+                            .map_err(|e| device.handle_hal_error(e))
+                            {
+                                break 'error Err(e.into());
+                            }
                             let texture_barriers = trackers
                                 .textures
                                 .set_from_usage_scope_and_drain_transitions(
@@ -1180,7 +1196,7 @@ impl Global {
                     }
 
                     if let Some(first_error) = first_error {
-                        return Err(first_error);
+                        break 'error Err(first_error);
                     }
                 }
             }
@@ -1190,9 +1206,9 @@ impl Global {
             {
                 used_surface_textures.set_size(hub.textures.read().len());
                 for texture in pending_writes.dst_textures.values() {
-                    match texture.try_inner(&snatch_guard)? {
-                        TextureInner::Native { .. } => {}
-                        TextureInner::Surface { .. } => {
+                    match texture.try_inner(&snatch_guard) {
+                        Ok(TextureInner::Native { .. }) => {}
+                        Ok(TextureInner::Surface { .. }) => {
                             // Compare the Arcs by pointer as Textures don't implement Eq
                             submit_surface_textures_owned
                                 .insert(Arc::as_ptr(texture), texture.clone());
@@ -1203,6 +1219,7 @@ impl Global {
                                     .unwrap()
                             };
                         }
+                        Err(e) => break 'error Err(e.into()),
                     }
                 }
 
@@ -1224,10 +1241,12 @@ impl Global {
                 }
             }
 
-            if let Some(pending_execution) =
-                pending_writes.pre_submit(&device.command_allocator, device, &queue)?
-            {
-                active_executions.insert(0, pending_execution);
+            match pending_writes.pre_submit(&device.command_allocator, device, &queue) {
+                Ok(Some(pending_execution)) => {
+                    active_executions.insert(0, pending_execution);
+                }
+                Ok(None) => {}
+                Err(e) => break 'error Err(e.into()),
             }
 
             let hal_command_buffers = active_executions
@@ -1249,14 +1268,17 @@ impl Global {
                     submit_surface_textures.push(raw);
                 }
 
-                unsafe {
+                if let Err(e) = unsafe {
                     queue.raw().submit(
                         &hal_command_buffers,
                         &submit_surface_textures,
                         (fence.as_mut(), submit_index),
                     )
                 }
-                .map_err(|e| device.handle_hal_error(e))?;
+                .map_err(|e| device.handle_hal_error(e))
+                {
+                    break 'error Err(e.into());
+                }
 
                 // Advance the successful submission index.
                 device
@@ -1280,12 +1302,19 @@ impl Global {
             let (closures, _) =
                 match device.maintain(fence_guard, wgt::Maintain::Poll, snatch_guard) {
                     Ok(closures) => closures,
-                    Err(WaitIdleError::Device(err)) => return Err(QueueSubmitError::Queue(err)),
-                    Err(WaitIdleError::StuckGpu) => return Err(QueueSubmitError::StuckGpu),
+                    Err(WaitIdleError::Device(err)) => {
+                        break 'error Err(QueueSubmitError::Queue(err))
+                    }
+                    Err(WaitIdleError::StuckGpu) => break 'error Err(QueueSubmitError::StuckGpu),
                     Err(WaitIdleError::WrongSubmissionIndex(..)) => unreachable!(),
                 };
 
-            (submit_index, closures)
+            Ok(closures)
+        };
+
+        let callbacks = match res {
+            Ok(ok) => ok,
+            Err(e) => return Err((submit_index, e)),
         };
 
         // the closures should execute with nothing locked!

diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
@@ -2074,7 +2074,10 @@ impl crate::Context for ContextWgpuCore {
 
         let index = match self.0.queue_submit(queue_data.id, &temp_command_buffers) {
             Ok(index) => index,
-            Err(err) => self.handle_error_fatal(err, "Queue::submit"),
+            Err((index, err)) => {
+                self.handle_error_nolabel(&queue_data.error_sink, err, "Queue::submit");
+                index
+            }
         };
 
         for cmdbuf in &temp_command_buffers {