From 6f4f701171a631691e62f6d7401d4bca9ed0a1fd Mon Sep 17 00:00:00 2001
From: Erich Gubler
Date: Thu, 19 Sep 2024 17:15:19 -0400
Subject: [PATCH] fix: handle `Queue::submit` non-fatally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Change the signature of `wgpu_core::Global::queue_submit` to return a
  `(SubmissionIndex, …)` in addition to its current error type.
* Change the control flow of errors in `Queue::submit` to break to the end
  of a block. This is similar to what we already do in many APIs in
  `wgpu_core`.
* Hoist the scope of the local `submit_index` binding so it can be used at
  the point where we need to convert current error paths to also return the
  submission index.

Later, we will likely want to avoid actually retrieving a new submission
index so we can minimize the critical section of code. We'll need to figure
out a strategy for returning a valid (but not necessarily unique) index in
the case of failures that prevent successful submission.
---
 deno_webgpu/queue.rs                 |   2 +-
 tests/tests/regression/issue_6317.rs |  58 ++++
 tests/tests/root.rs                  |   1 +
 wgpu-core/src/device/queue.rs        | 426 +++++++++++++++------------
 wgpu/src/backend/wgpu_core.rs        |   5 +-
 5 files changed, 295 insertions(+), 197 deletions(-)
 create mode 100644 tests/tests/regression/issue_6317.rs

diff --git a/deno_webgpu/queue.rs b/deno_webgpu/queue.rs
index fdbf993f8c..5915b68f27 100644
--- a/deno_webgpu/queue.rs
+++ b/deno_webgpu/queue.rs
@@ -44,7 +44,7 @@ pub fn op_webgpu_queue_submit(
         })
         .collect::<Result<Vec<_>, AnyError>>()?;
 
-    let maybe_err = instance.queue_submit(queue, &ids).err();
+    let maybe_err = instance.queue_submit(queue, &ids).err().map(|(_idx, e)| e);
 
     for rid in command_buffers {
         let resource = state.resource_table.take::<WebGpuCommandBuffer>(rid)?;
diff --git a/tests/tests/regression/issue_6317.rs b/tests/tests/regression/issue_6317.rs
new file mode 100644
index 0000000000..20945006f7
--- /dev/null
+++ b/tests/tests/regression/issue_6317.rs
@@ -0,0 +1,58 @@
+use wgpu::{DownlevelFlags, Limits};
+use wgpu_macros::gpu_test;
+use wgpu_test::{fail, GpuTestConfiguration, TestParameters};
+
+#[gpu_test]
+static NON_FATAL_ERRORS_IN_QUEUE_SUBMIT: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS)
+            .limits(Limits::downlevel_defaults()),
+    )
+    .run_sync(|ctx| {
+        let shader_with_trivial_bind_group = concat!(
+            "@group(0) @binding(0) var<storage, read_write> stuff: u32;\n",
+            "\n",
+            "@compute @workgroup_size(1) fn main() { stuff = 2u; }\n"
+        );
+
+        let module = ctx
+            .device
+            .create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: None,
+                source: wgpu::ShaderSource::Wgsl(shader_with_trivial_bind_group.into()),
+            });
+
+        let compute_pipeline =
+            ctx.device
+                .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                    label: None,
+                    layout: None,
+                    module: &module,
+                    entry_point: None,
+                    compilation_options: Default::default(),
+                    cache: Default::default(),
+                });
+
+        fail(
+            &ctx.device,
+            || {
+                let mut command_encoder = ctx.device.create_command_encoder(&Default::default());
+                {
+                    let mut render_pass = command_encoder.begin_compute_pass(&Default::default());
+                    render_pass.set_pipeline(&compute_pipeline);
+
+                    // NOTE: We deliberately don't set a bind group here, to provoke a validation
+                    // error.
+
+                    render_pass.dispatch_workgroups(1, 1, 1);
+                }
+
+                let _idx = ctx.queue.submit([command_encoder.finish()]);
+            },
+            Some(concat!(
+                "The current set ComputePipeline with '' label ",
+                "expects a BindGroup to be set at index 0"
+            )),
+        )
+    });
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index df0dce5fed..3bb8e14a90 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -6,6 +6,7 @@ mod regression {
     mod issue_4485;
     mod issue_4514;
     mod issue_5553;
+    mod issue_6317;
 }
 
 mod bgra8unorm_storage;
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index f576b24125..218e8a6813 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -1027,11 +1027,13 @@ impl Global {
         &self,
         queue_id: QueueId,
         command_buffer_ids: &[id::CommandBufferId],
-    ) -> Result<SubmissionIndex, QueueSubmitError> {
+    ) -> Result<SubmissionIndex, (SubmissionIndex, QueueSubmitError)> {
         profiling::scope!("Queue::submit");
         api_log!("Queue::submit {queue_id:?}");
 
-        let (submit_index, callbacks) = {
+        let submit_index;
+
+        let res = 'error: {
             let hub = &self.hub;
 
             let queue = hub.queues.get(queue_id);
@@ -1042,7 +1044,7 @@ impl Global {
             // Fence lock must be acquired after the snatch lock everywhere to avoid deadlocks.
             let mut fence = device.fence.write();
 
-            let submit_index = device
+            submit_index = device
                 .active_submission_index
                 .fetch_add(1, Ordering::SeqCst)
                 + 1;
@@ -1055,237 +1057,271 @@ impl Global {
             let mut submit_surface_textures_owned = FastHashMap::default();
 
             {
-                let command_buffer_guard = hub.command_buffers.read();
+                {
+                    let command_buffer_guard = hub.command_buffers.read();
 
-                if !command_buffer_ids.is_empty() {
-                    profiling::scope!("prepare");
+                    if !command_buffer_ids.is_empty() {
+                        profiling::scope!("prepare");
 
-                    let mut first_error = None;
+                        let mut first_error = None;
 
-                    //TODO: if multiple command buffers are submitted, we can re-use the last
-                    // native command buffer of the previous chain instead of always creating
-                    // a temporary one, since the chains are not finished.
+                        //TODO: if multiple command buffers are submitted, we can re-use the last
+                        // native command buffer of the previous chain instead of always creating
+                        // a temporary one, since the chains are not finished.
 
-                    // finish all the command buffers first
-                    for command_buffer_id in command_buffer_ids {
-                        profiling::scope!("process command buffer");
+                        // finish all the command buffers first
+                        for command_buffer_id in command_buffer_ids {
+                            profiling::scope!("process command buffer");
 
-                        // we reset the used surface textures every time we use
-                        // it, so make sure to set_size on it.
-                        used_surface_textures.set_size(device.tracker_indices.textures.size());
+                            // we reset the used surface textures every time we use
+                            // it, so make sure to set_size on it.
+                            used_surface_textures.set_size(device.tracker_indices.textures.size());
 
-                        let command_buffer = command_buffer_guard.get(*command_buffer_id);
+                            let command_buffer = command_buffer_guard.get(*command_buffer_id);
 
-                        // Note that we are required to invalidate all command buffers in both the success and failure paths.
-                        // This is why we `continue` and don't early return via `?`.
-                        #[allow(unused_mut)]
-                        let mut cmd_buf_data = command_buffer.try_take();
+                            // Note that we are required to invalidate all command buffers in both the success and failure paths.
+                            // This is why we `continue` and don't early return via `?`.
+                            #[allow(unused_mut)]
+                            let mut cmd_buf_data = command_buffer.try_take();
 
-                        #[cfg(feature = "trace")]
-                        if let Some(ref mut trace) = *device.trace.lock() {
-                            if let Ok(ref mut cmd_buf_data) = cmd_buf_data {
-                                trace.add(Action::Submit(
-                                    submit_index,
-                                    cmd_buf_data.commands.take().unwrap(),
-                                ));
+                            #[cfg(feature = "trace")]
+                            if let Some(ref mut trace) = *device.trace.lock() {
+                                if let Ok(ref mut cmd_buf_data) = cmd_buf_data {
+                                    trace.add(Action::Submit(
+                                        submit_index,
+                                        cmd_buf_data.commands.take().unwrap(),
+                                    ));
+                                }
                             }
-                        }
 
-                        let mut baked = match cmd_buf_data {
-                            Ok(cmd_buf_data) => {
-                                let res = validate_command_buffer(
-                                    &command_buffer,
-                                    &queue,
-                                    &cmd_buf_data,
-                                    &snatch_guard,
-                                    &mut submit_surface_textures_owned,
-                                    &mut used_surface_textures,
-                                );
-                                if let Err(err) = res {
-                                    first_error.get_or_insert(err);
-                                    cmd_buf_data.destroy(&command_buffer.device);
+                            let mut baked = match cmd_buf_data {
+                                Ok(cmd_buf_data) => {
+                                    let res = validate_command_buffer(
+                                        &command_buffer,
+                                        &queue,
+                                        &cmd_buf_data,
+                                        &snatch_guard,
+                                        &mut submit_surface_textures_owned,
+                                        &mut used_surface_textures,
+                                    );
+                                    if let Err(err) = res {
+                                        first_error.get_or_insert(err);
+                                        cmd_buf_data.destroy(&command_buffer.device);
+                                        continue;
+                                    }
+                                    cmd_buf_data.into_baked_commands()
+                                }
+                                Err(err) => {
+                                    first_error.get_or_insert(err.into());
                                     continue;
                                 }
-                                cmd_buf_data.into_baked_commands()
-                            }
-                            Err(err) => {
-                                first_error.get_or_insert(err.into());
+                            };
+
+                            if first_error.is_some() {
                                 continue;
                             }
-                        };
 
-                        if first_error.is_some() {
-                            continue;
-                        }
+                            // execute resource transitions
+                            if let Err(e) = unsafe {
+                                baked.encoder.begin_encoding(hal_label(
+                                    Some("(wgpu internal) Transit"),
+                                    device.instance_flags,
+                                ))
+                            }
+                            .map_err(|e| device.handle_hal_error(e))
+                            {
+                                break 'error Err(e.into());
+                            }
 
-                        // execute resource transitions
-                        unsafe {
-                            baked.encoder.begin_encoding(hal_label(
-                                Some("(wgpu internal) Transit"),
-                                device.instance_flags,
-                            ))
+                            //Note: locking the trackers has to be done after the storages
+                            let mut trackers = device.trackers.lock();
+                            if let Err(e) =
+                                baked.initialize_buffer_memory(&mut trackers, &snatch_guard)
+                            {
+                                break 'error Err(e.into());
+                            }
+                            if let Err(e) = baked.initialize_texture_memory(
+                                &mut trackers,
+                                device,
+                                &snatch_guard,
+                            ) {
+                                break 'error Err(e.into());
+                            }
+
+                            //Note: stateless trackers are not merged:
+                            // device already knows these resources exist.
+                            CommandBuffer::insert_barriers_from_device_tracker(
+                                baked.encoder.as_mut(),
+                                &mut trackers,
+                                &baked.trackers,
+                                &snatch_guard,
+                            );
+
+                            let transit = unsafe { baked.encoder.end_encoding().unwrap() };
+                            baked.list.insert(0, transit);
+
+                            // Transition surface textures into `Present` state.
+                            // Note: we could technically do it after all of the command buffers,
+                            // but here we have a command encoder by hand, so it's easier to use it.
+                            if !used_surface_textures.is_empty() {
+                                if let Err(e) = unsafe {
+                                    baked.encoder.begin_encoding(hal_label(
+                                        Some("(wgpu internal) Present"),
+                                        device.instance_flags,
+                                    ))
+                                }
+                                .map_err(|e| device.handle_hal_error(e))
+                                {
+                                    break 'error Err(e.into());
+                                }
+                                let texture_barriers = trackers
+                                    .textures
+                                    .set_from_usage_scope_and_drain_transitions(
+                                        &used_surface_textures,
+                                        &snatch_guard,
+                                    )
+                                    .collect::<Vec<_>>();
+                                let present = unsafe {
+                                    baked.encoder.transition_textures(&texture_barriers);
+                                    baked.encoder.end_encoding().unwrap()
+                                };
+                                baked.list.push(present);
+                                used_surface_textures = track::TextureUsageScope::default();
+                            }
+
+                            // done
+                            active_executions.push(EncoderInFlight {
+                                raw: baked.encoder,
+                                cmd_buffers: baked.list,
+                                trackers: baked.trackers,
+                                pending_buffers: FastHashMap::default(),
+                                pending_textures: FastHashMap::default(),
+                            });
                         }
 
-                        .map_err(|e| device.handle_hal_error(e))?;
-
-                        //Note: locking the trackers has to be done after the storages
-                        let mut trackers = device.trackers.lock();
-                        baked.initialize_buffer_memory(&mut trackers, &snatch_guard)?;
-                        baked.initialize_texture_memory(&mut trackers, device, &snatch_guard)?;
-                        //Note: stateless trackers are not merged:
-                        // device already knows these resources exist.
-                        CommandBuffer::insert_barriers_from_device_tracker(
-                            baked.encoder.as_mut(),
-                            &mut trackers,
-                            &baked.trackers,
-                            &snatch_guard,
-                        );
+                        if let Some(first_error) = first_error {
+                            break 'error Err(first_error);
+                        }
+                    }
                 }
 
-                let transit = unsafe { baked.encoder.end_encoding().unwrap() };
-                baked.list.insert(0, transit);
+            let mut pending_writes = device.pending_writes.lock();
 
-                // Transition surface textures into `Present` state.
-                // Note: we could technically do it after all of the command buffers,
-                // but here we have a command encoder by hand, so it's easier to use it.
-                if !used_surface_textures.is_empty() {
-                    unsafe {
-                        baked.encoder.begin_encoding(hal_label(
-                            Some("(wgpu internal) Present"),
-                            device.instance_flags,
-                        ))
+            {
+                used_surface_textures.set_size(hub.textures.read().len());
+                for texture in pending_writes.dst_textures.values() {
+                    match texture.try_inner(&snatch_guard) {
+                        Ok(TextureInner::Native { .. }) => {}
+                        Ok(TextureInner::Surface { .. }) => {
+                            // Compare the Arcs by pointer as Textures don't implement Eq
+                            submit_surface_textures_owned
+                                .insert(Arc::as_ptr(texture), texture.clone());
+
+                            unsafe {
+                                used_surface_textures
+                                    .merge_single(texture, None, hal::TextureUses::PRESENT)
+                                    .unwrap()
+                            };
                         }
-                    .map_err(|e| device.handle_hal_error(e))?;
-                    let texture_barriers = trackers
-                        .textures
-                        .set_from_usage_scope_and_drain_transitions(
-                            &used_surface_textures,
-                            &snatch_guard,
-                        )
-                        .collect::<Vec<_>>();
-                    let present = unsafe {
-                        baked.encoder.transition_textures(&texture_barriers);
-                        baked.encoder.end_encoding().unwrap()
-                    };
-                    baked.list.push(present);
-                    used_surface_textures = track::TextureUsageScope::default();
+                        Err(e) => break 'error Err(e.into()),
                     }
-
-                    // done
-                    active_executions.push(EncoderInFlight {
-                        raw: baked.encoder,
-                        cmd_buffers: baked.list,
-                        trackers: baked.trackers,
-                        pending_buffers: FastHashMap::default(),
-                        pending_textures: FastHashMap::default(),
-                    });
                 }
 
-                if let Some(first_error) = first_error {
-                    return Err(first_error);
+                if !used_surface_textures.is_empty() {
+                    let mut trackers = device.trackers.lock();
+
+                    let texture_barriers = trackers
+                        .textures
+                        .set_from_usage_scope_and_drain_transitions(
+                            &used_surface_textures,
+                            &snatch_guard,
+                        )
+                        .collect::<Vec<_>>();
+                    unsafe {
+                        pending_writes
+                            .command_encoder
+                            .transition_textures(&texture_barriers);
+                    };
                 }
             }
-            }
 
-            let mut pending_writes = device.pending_writes.lock();
-
-            {
-                used_surface_textures.set_size(hub.textures.read().len());
-                for texture in pending_writes.dst_textures.values() {
-                    match texture.try_inner(&snatch_guard)? {
-                        TextureInner::Native { .. } => {}
-                        TextureInner::Surface { .. } => {
-                            // Compare the Arcs by pointer as Textures don't implement Eq
-                            submit_surface_textures_owned
-                                .insert(Arc::as_ptr(texture), texture.clone());
-
-                            unsafe {
-                                used_surface_textures
-                                    .merge_single(texture, None, hal::TextureUses::PRESENT)
-                                    .unwrap()
-                            };
-                        }
+            match pending_writes.pre_submit(&device.command_allocator, device, &queue) {
+                Ok(Some(pending_execution)) => {
+                    active_executions.insert(0, pending_execution);
                 }
+                Ok(None) => {}
+                Err(e) => break 'error Err(e.into()),
             }
-            if !used_surface_textures.is_empty() {
-                let mut trackers = device.trackers.lock();
+            let hal_command_buffers = active_executions
+                .iter()
+                .flat_map(|e| e.cmd_buffers.iter().map(|b| b.as_ref()))
+                .collect::<Vec<_>>();
 
-                let texture_barriers = trackers
-                    .textures
-                    .set_from_usage_scope_and_drain_transitions(
-                        &used_surface_textures,
-                        &snatch_guard,
-                    )
-                    .collect::<Vec<_>>();
-                unsafe {
-                    pending_writes
-                        .command_encoder
-                        .transition_textures(&texture_barriers);
-                };
-            }
-            }
+            {
+                let mut submit_surface_textures =
+                    SmallVec::<[&dyn hal::DynSurfaceTexture; 2]>::with_capacity(
+                        submit_surface_textures_owned.len(),
+                    );
 
-            if let Some(pending_execution) =
-                pending_writes.pre_submit(&device.command_allocator, device, &queue)?
-            {
-                active_executions.insert(0, pending_execution);
-            }
+                for texture in submit_surface_textures_owned.values() {
+                    let raw = match texture.inner.get(&snatch_guard) {
+                        Some(TextureInner::Surface { raw, .. }) => raw.as_ref(),
+                        _ => unreachable!(),
+                    };
+                    submit_surface_textures.push(raw);
+                }
 
-            let hal_command_buffers = active_executions
-                .iter()
-                .flat_map(|e| e.cmd_buffers.iter().map(|b| b.as_ref()))
-                .collect::<Vec<_>>();
+                if let Err(e) = unsafe {
+                    queue.raw().submit(
+                        &hal_command_buffers,
+                        &submit_surface_textures,
+                        (fence.as_mut(), submit_index),
+                    )
+                }
+                .map_err(|e| device.handle_hal_error(e))
+                {
+                    break 'error Err(e.into());
+                }
 
-            {
-                let mut submit_surface_textures =
-                    SmallVec::<[&dyn hal::DynSurfaceTexture; 2]>::with_capacity(
-                        submit_surface_textures_owned.len(),
-                    );
-
-                for texture in submit_surface_textures_owned.values() {
-                    let raw = match texture.inner.get(&snatch_guard) {
-                        Some(TextureInner::Surface { raw, .. }) => raw.as_ref(),
-                        _ => unreachable!(),
-                    };
-                    submit_surface_textures.push(raw);
+                // Advance the successful submission index.
+                device
+                    .last_successful_submission_index
+                    .fetch_max(submit_index, Ordering::SeqCst);
                 }
 
-            unsafe {
-                queue.raw().submit(
-                    &hal_command_buffers,
-                    &submit_surface_textures,
-                    (fence.as_mut(), submit_index),
-                )
-            }
-            .map_err(|e| device.handle_hal_error(e))?;
+            profiling::scope!("cleanup");
+
+            // this will register the new submission to the life time tracker
+            device.lock_life().track_submission(
+                submit_index,
+                pending_writes.temp_resources.drain(..),
+                active_executions,
+            );
+            drop(pending_writes);
+
+            // This will schedule destruction of all resources that are no longer needed
+            // by the user but used in the command stream, among other things.
+            let fence_guard = RwLockWriteGuard::downgrade(fence);
+            let (closures, _) =
+                match device.maintain(fence_guard, wgt::Maintain::Poll, snatch_guard) {
+                    Ok(closures) => closures,
+                    Err(WaitIdleError::Device(err)) => {
+                        break 'error Err(QueueSubmitError::Queue(err))
+                    }
+                    Err(WaitIdleError::StuckGpu) => {
+                        break 'error Err(QueueSubmitError::StuckGpu)
+                    }
+                    Err(WaitIdleError::WrongSubmissionIndex(..)) => unreachable!(),
+                };
 
-            // Advance the successful submission index.
-            device
-                .last_successful_submission_index
-                .fetch_max(submit_index, Ordering::SeqCst);
+            Ok(closures)
         }
+        };
 
-            profiling::scope!("cleanup");
-
-            // this will register the new submission to the life time tracker
-            device.lock_life().track_submission(
-                submit_index,
-                pending_writes.temp_resources.drain(..),
-                active_executions,
-            );
-            drop(pending_writes);
-
-            // This will schedule destruction of all resources that are no longer needed
-            // by the user but used in the command stream, among other things.
-            let fence_guard = RwLockWriteGuard::downgrade(fence);
-            let (closures, _) =
-                match device.maintain(fence_guard, wgt::Maintain::Poll, snatch_guard) {
-                    Ok(closures) => closures,
-                    Err(WaitIdleError::Device(err)) => return Err(QueueSubmitError::Queue(err)),
-                    Err(WaitIdleError::StuckGpu) => return Err(QueueSubmitError::StuckGpu),
-                    Err(WaitIdleError::WrongSubmissionIndex(..)) => unreachable!(),
                 };
 
-            (submit_index, closures)
+        let callbacks = match res {
+            Ok(ok) => ok,
+            Err(e) => return Err((submit_index, e)),
         };
 
         // the closures should execute with nothing locked!
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index 3aac20e21f..1d1ffda209 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -2074,7 +2074,10 @@ impl crate::Context for ContextWgpuCore {
 
         let index = match self.0.queue_submit(queue_data.id, &temp_command_buffers) {
             Ok(index) => index,
-            Err(err) => self.handle_error_fatal(err, "Queue::submit"),
+            Err((index, err)) => {
+                self.handle_error_nolabel(&queue_data.error_sink, err, "Queue::submit");
+                index
+            }
         };
 
         for cmdbuf in &temp_command_buffers {
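
For readers unfamiliar with the control-flow pattern the patch introduces in `queue_submit`, here is a minimal, self-contained sketch of the same idea: a labeled block (`'error: { ... }`) replaces early `return`s with `break 'error Err(..)`, and the hoisted `submit_index` binding is attached to the error after the block, so the caller can surface the failure non-fatally while still learning which submission it belongs to. The names below (`SubmissionIndex`, `SubmitError`, `do_submit`) are illustrative stand-ins for this sketch, not items from `wgpu_core`.

    type SubmissionIndex = u64;

    #[derive(Debug, PartialEq)]
    enum SubmitError {
        // Hypothetical error kind, standing in for `QueueSubmitError`.
        Validation(String),
    }

    // Stand-in for `Global::queue_submit`: on failure it still reports the
    // submission index alongside the error, instead of aborting fatally.
    fn do_submit(fail_validation: bool) -> Result<SubmissionIndex, (SubmissionIndex, SubmitError)> {
        let submit_index;

        let res = 'error: {
            // Hoisted binding: assigned before any fallible step so the error
            // path after the block can still report it.
            submit_index = 42;

            if fail_validation {
                // Instead of `return Err(..)`, break to the end of the block so
                // that cleanup shared by both paths still runs.
                break 'error Err(SubmitError::Validation("missing bind group".into()));
            }

            Ok(())
        };

        // Cleanup common to success and failure would run here, with nothing locked.

        match res {
            Ok(()) => Ok(submit_index),
            Err(e) => Err((submit_index, e)),
        }
    }

    fn main() {
        assert_eq!(do_submit(false), Ok(42));
        assert_eq!(
            do_submit(true),
            Err((42, SubmitError::Validation("missing bind group".into())))
        );
    }

The caller-side shape mirrors the `wgpu_core.rs` hunk above: match on `Err((index, err))`, report `err` through a non-fatal error sink, and keep using the returned index.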