From c82e92baeea4d9454019efa90813d50f24d69595 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Fri, 29 Mar 2024 12:14:28 -0700
Subject: [PATCH 1/3] Micro-optimize `queue_material_meshes`, primarily to
 remove bit manipulation.

This commit makes the following optimizations:

`MeshPipelineKey` has been split into `BaseMeshPipelineKey`, which lives
in `bevy_render`, and `MeshPipelineKey`, which lives in `bevy_pbr`.
Conceptually, `BaseMeshPipelineKey` is a superclass of
`MeshPipelineKey`. For `BaseMeshPipelineKey`, the bits start at the
highest (most significant) bit and grow downward toward the lowest bit;
for `MeshPipelineKey`, the bits start at the lowest bit and grow upward
toward the highest bit. This prevents them from colliding.

The goal of this is to avoid having to reassemble bits of the pipeline
key for every mesh every frame. Instead, we can just use a bitwise or
operation to combine the pieces that make up a `MeshPipelineKey`.

Previously, all of `specialize()` was marked as `#[inline]`. This
bloated `queue_material_meshes` unnecessarily, as most of it is a slow
path that's rarely hit. This commit refactors the function to move the
slow path to `specialize_slow()`.

Together, these two changes shave about 5% off `queue_material_meshes`.
---
 crates/bevy_pbr/src/material.rs               |  45 ++++---
 crates/bevy_pbr/src/prepass/mod.rs            |   7 +-
 crates/bevy_pbr/src/render/light.rs           |  13 +-
 crates/bevy_pbr/src/render/mesh.rs            |  38 +++---
 crates/bevy_render/src/mesh/mesh/mod.rs       |  55 ++++++++-
 .../render_resource/pipeline_specializer.rs   | 115 +++++++++++-------
 crates/bevy_sprite/src/mesh2d/material.rs     |   2 +-
 examples/2d/mesh2d_manual.rs                  |   2 +-
 examples/shader/shader_instancing.rs          |   3 +-
 9 files changed, 172 insertions(+), 108 deletions(-)

diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs
index 4e42cde7f5696..634c3ba69f440 100644
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@@ -659,25 +659,9 @@ pub fn queue_material_meshes<M: Material>(
                 continue;
             };
 
-            let forward = match material.properties.render_method {
-                OpaqueRendererMethod::Forward => true,
-                OpaqueRendererMethod::Deferred => false,
-                OpaqueRendererMethod::Auto => unreachable!(),
-            };
-
-            let mut mesh_key = view_key;
-
-            mesh_key |= MeshPipelineKey::from_primitive_topology(mesh.primitive_topology);
-
-            if mesh.morph_targets.is_some() {
-                mesh_key |= MeshPipelineKey::MORPH_TARGETS;
-            }
-
-            if material.properties.reads_view_transmission_texture {
-                mesh_key |= MeshPipelineKey::READS_VIEW_TRANSMISSION_TEXTURE;
-            }
-
-            mesh_key |= alpha_mode_pipeline_key(material.properties.alpha_mode);
+            let mut mesh_key = view_key
+                | MeshPipelineKey::from_bits_retain(mesh.key_bits.bits())
+                | material.properties.mesh_pipeline_key_bits;
 
             if render_lightmaps
                 .render_lightmaps
@@ -721,7 +705,7 @@ pub fn queue_material_meshes<M: Material>(
                             batch_range: 0..1,
                             dynamic_offset: None,
                         });
-                    } else if forward {
+                    } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                         opaque_phase.add(Opaque3d {
                             entity: *visible_entity,
                             draw_function: draw_opaque_pbr,
@@ -745,7 +729,7 @@ pub fn queue_material_meshes<M: Material>(
                             batch_range: 0..1,
                             dynamic_offset: None,
                         });
-                    } else if forward {
+                    } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                         alpha_mask_phase.add(AlphaMask3d {
                             entity: *visible_entity,
                             draw_function: draw_alpha_mask_pbr,
@@ -817,7 +801,7 @@ impl DefaultOpaqueRendererMethod {
 /// bandwidth usage which can be unsuitable for low end mobile or other bandwidth-constrained devices.
 ///
 /// If a material indicates `OpaqueRendererMethod::Auto`, `DefaultOpaqueRendererMethod` will be used.
-#[derive(Default, Clone, Copy, Debug, Reflect)]
+#[derive(Default, Clone, Copy, Debug, PartialEq, Reflect)]
 pub enum OpaqueRendererMethod {
     #[default]
     Forward,
@@ -832,6 +816,11 @@ pub struct MaterialProperties {
     pub render_method: OpaqueRendererMethod,
     /// The [`AlphaMode`] of this material.
     pub alpha_mode: AlphaMode,
+    /// The bits in the [`MeshPipelineKey`] for this material.
+    ///
+    /// These are precalculated so that we can just "or" them together in
+    /// [`queue_material_meshes`].
+    pub mesh_pipeline_key_bits: MeshPipelineKey,
     /// Add a bias to the view depth of the mesh which can be used to force a specific render order
     /// for meshes with equal depth, to avoid z-fighting.
     /// The bias is in depth-texture units so large values may be needed to overcome small depth differences.
@@ -1055,6 +1044,14 @@ fn prepare_material<M: Material>(
         OpaqueRendererMethod::Deferred => OpaqueRendererMethod::Deferred,
         OpaqueRendererMethod::Auto => default_opaque_render_method,
     };
+
+    let mut mesh_pipeline_key_bits = MeshPipelineKey::empty();
+    mesh_pipeline_key_bits.set(
+        MeshPipelineKey::READS_VIEW_TRANSMISSION_TEXTURE,
+        material.reads_view_transmission_texture(),
+    );
+    mesh_pipeline_key_bits.insert(alpha_mode_pipeline_key(material.alpha_mode()));
+
     Ok(PreparedMaterial {
         bindings: prepared.bindings,
         bind_group: prepared.bind_group,
@@ -1062,8 +1059,10 @@ fn prepare_material<M: Material>(
         properties: MaterialProperties {
             alpha_mode: material.alpha_mode(),
             depth_bias: material.depth_bias(),
-            reads_view_transmission_texture: material.reads_view_transmission_texture(),
+            reads_view_transmission_texture: mesh_pipeline_key_bits
+                .contains(MeshPipelineKey::READS_VIEW_TRANSMISSION_TEXTURE),
             render_method: method,
+            mesh_pipeline_key_bits,
         },
     })
 }
diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs
index 6e78c8f4c8c5e..424ed389d1bff 100644
--- a/crates/bevy_pbr/src/prepass/mod.rs
+++ b/crates/bevy_pbr/src/prepass/mod.rs
@@ -783,11 +783,8 @@ pub fn queue_prepass_material_meshes<M: Material>(
                 continue;
             };
 
-            let mut mesh_key =
-                MeshPipelineKey::from_primitive_topology(mesh.primitive_topology) | view_key;
-            if mesh.morph_targets.is_some() {
-                mesh_key |= MeshPipelineKey::MORPH_TARGETS;
-            }
+            let mut mesh_key = view_key | MeshPipelineKey::from_bits_retain(mesh.key_bits.bits());
+
             let alpha_mode = material.properties.alpha_mode;
             match alpha_mode {
                 AlphaMode::Opaque => {}
diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs
index 12d8961f988a4..f054c5b3d27ee 100644
--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@@ -1643,6 +1643,10 @@ pub fn queue_shadows<M: Material>(
             };
             // NOTE: Lights with shadow mapping disabled will have no visible entities
             // so no meshes will be queued
+
+            let mut light_key = MeshPipelineKey::DEPTH_PREPASS;
+            light_key.set(MeshPipelineKey::DEPTH_CLAMP_ORTHO, is_directional_light);
+
             for entity in visible_entities.iter().copied() {
                 let Some(mesh_instance) = render_mesh_instances.get(&entity) else {
                     continue;
@@ -1661,14 +1665,7 @@ pub fn queue_shadows<M: Material>(
                 };
 
                 let mut mesh_key =
-                    MeshPipelineKey::from_primitive_topology(mesh.primitive_topology)
-                        | MeshPipelineKey::DEPTH_PREPASS;
-                if mesh.morph_targets.is_some() {
-                    mesh_key |= MeshPipelineKey::MORPH_TARGETS;
-                }
-                if is_directional_light {
-                    mesh_key |= MeshPipelineKey::DEPTH_CLAMP_ORTHO;
-                }
+                    light_key | MeshPipelineKey::from_bits_retain(mesh.key_bits.bits());
 
                 // Even though we don't use the lightmap in the shadow map, the
                 // `SetMeshBindGroup` render command will bind the data for it. So
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 31f3a29352a28..21483a7d0fb3d 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -491,17 +491,16 @@ bitflags::bitflags! {
         const SCREEN_SPACE_AMBIENT_OCCLUSION    = 1 << 9;
         const DEPTH_CLAMP_ORTHO                 = 1 << 10;
         const TEMPORAL_JITTER                   = 1 << 11;
-        const MORPH_TARGETS                     = 1 << 12;
-        const READS_VIEW_TRANSMISSION_TEXTURE   = 1 << 13;
-        const LIGHTMAPPED                       = 1 << 14;
-        const IRRADIANCE_VOLUME                 = 1 << 15;
+        const READS_VIEW_TRANSMISSION_TEXTURE   = 1 << 12;
+        const LIGHTMAPPED                       = 1 << 13;
+        const IRRADIANCE_VOLUME                 = 1 << 14;
+        const LAST_FLAG                         = Self::IRRADIANCE_VOLUME.bits();
         const BLEND_RESERVED_BITS               = Self::BLEND_MASK_BITS << Self::BLEND_SHIFT_BITS; // ← Bitmask reserving bits for the blend state
         const BLEND_OPAQUE                      = 0 << Self::BLEND_SHIFT_BITS;                   // ← Values are just sequential within the mask, and can range from 0 to 3
         const BLEND_PREMULTIPLIED_ALPHA         = 1 << Self::BLEND_SHIFT_BITS;                   //
         const BLEND_MULTIPLY                    = 2 << Self::BLEND_SHIFT_BITS;                   // ← We still have room for one more value without adding more bits
         const BLEND_ALPHA                       = 3 << Self::BLEND_SHIFT_BITS;
         const MSAA_RESERVED_BITS                = Self::MSAA_MASK_BITS << Self::MSAA_SHIFT_BITS;
-        const PRIMITIVE_TOPOLOGY_RESERVED_BITS  = Self::PRIMITIVE_TOPOLOGY_MASK_BITS << Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS;
         const TONEMAP_METHOD_RESERVED_BITS      = Self::TONEMAP_METHOD_MASK_BITS << Self::TONEMAP_METHOD_SHIFT_BITS;
         const TONEMAP_METHOD_NONE               = 0 << Self::TONEMAP_METHOD_SHIFT_BITS;
         const TONEMAP_METHOD_REINHARD           = 1 << Self::TONEMAP_METHOD_SHIFT_BITS;
@@ -525,36 +524,32 @@ bitflags::bitflags! {
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_MEDIUM = 1 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_HIGH = 2 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_ULTRA = 3 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
+        const MORPH_TARGETS                     = BaseMeshPipelineKey::MORPH_TARGETS.bits();
     }
 }
 
 impl MeshPipelineKey {
     const MSAA_MASK_BITS: u32 = 0b111;
-    const MSAA_SHIFT_BITS: u32 = 32 - Self::MSAA_MASK_BITS.count_ones();
-
-    const PRIMITIVE_TOPOLOGY_MASK_BITS: u32 = 0b111;
-    const PRIMITIVE_TOPOLOGY_SHIFT_BITS: u32 =
-        Self::MSAA_SHIFT_BITS - Self::PRIMITIVE_TOPOLOGY_MASK_BITS.count_ones();
+    const MSAA_SHIFT_BITS: u32 = Self::LAST_FLAG.bits().trailing_zeros() + 1; // bit after LAST_FLAG
 
     const BLEND_MASK_BITS: u32 = 0b11;
-    const BLEND_SHIFT_BITS: u32 =
-        Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS - Self::BLEND_MASK_BITS.count_ones();
+    const BLEND_SHIFT_BITS: u32 = Self::MSAA_MASK_BITS.count_ones() + Self::MSAA_SHIFT_BITS;
 
     const TONEMAP_METHOD_MASK_BITS: u32 = 0b111;
     const TONEMAP_METHOD_SHIFT_BITS: u32 =
-        Self::BLEND_SHIFT_BITS - Self::TONEMAP_METHOD_MASK_BITS.count_ones();
+        Self::BLEND_MASK_BITS.count_ones() + Self::BLEND_SHIFT_BITS;
 
     const SHADOW_FILTER_METHOD_MASK_BITS: u32 = 0b11;
     const SHADOW_FILTER_METHOD_SHIFT_BITS: u32 =
-        Self::TONEMAP_METHOD_SHIFT_BITS - Self::SHADOW_FILTER_METHOD_MASK_BITS.count_ones();
+        Self::TONEMAP_METHOD_MASK_BITS.count_ones() + Self::TONEMAP_METHOD_SHIFT_BITS;
 
     const VIEW_PROJECTION_MASK_BITS: u32 = 0b11;
     const VIEW_PROJECTION_SHIFT_BITS: u32 =
-        Self::SHADOW_FILTER_METHOD_SHIFT_BITS - Self::VIEW_PROJECTION_MASK_BITS.count_ones();
+        Self::SHADOW_FILTER_METHOD_MASK_BITS.count_ones() + Self::SHADOW_FILTER_METHOD_SHIFT_BITS;
 
     const SCREEN_SPACE_SPECULAR_TRANSMISSION_MASK_BITS: u32 = 0b11;
-    const SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS: u32 = Self::VIEW_PROJECTION_SHIFT_BITS
-        - Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_MASK_BITS.count_ones();
+    const SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS: u32 =
+        Self::VIEW_PROJECTION_MASK_BITS.count_ones() + Self::VIEW_PROJECTION_SHIFT_BITS;
 
     pub fn from_msaa_samples(msaa_samples: u32) -> Self {
         let msaa_bits =
@@ -576,14 +571,15 @@ impl MeshPipelineKey {
 
     pub fn from_primitive_topology(primitive_topology: PrimitiveTopology) -> Self {
         let primitive_topology_bits = ((primitive_topology as u32)
-            & Self::PRIMITIVE_TOPOLOGY_MASK_BITS)
-            << Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS;
+            & BaseMeshPipelineKey::PRIMITIVE_TOPOLOGY_MASK_BITS)
+            << BaseMeshPipelineKey::PRIMITIVE_TOPOLOGY_SHIFT_BITS;
         Self::from_bits_retain(primitive_topology_bits)
     }
 
     pub fn primitive_topology(&self) -> PrimitiveTopology {
-        let primitive_topology_bits = (self.bits() >> Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS)
-            & Self::PRIMITIVE_TOPOLOGY_MASK_BITS;
+        let primitive_topology_bits = (self.bits()
+            >> BaseMeshPipelineKey::PRIMITIVE_TOPOLOGY_SHIFT_BITS)
+            & BaseMeshPipelineKey::PRIMITIVE_TOPOLOGY_MASK_BITS;
         match primitive_topology_bits {
             x if x == PrimitiveTopology::PointList as u32 => PrimitiveTopology::PointList,
             x if x == PrimitiveTopology::LineList as u32 => PrimitiveTopology::LineList,
diff --git a/crates/bevy_render/src/mesh/mesh/mod.rs b/crates/bevy_render/src/mesh/mesh/mod.rs
index e5f355bc51578..7756ed61b42e7 100644
--- a/crates/bevy_render/src/mesh/mesh/mod.rs
+++ b/crates/bevy_render/src/mesh/mesh/mod.rs
@@ -1,6 +1,7 @@
 mod conversions;
 pub mod skinning;
 use bevy_transform::components::Transform;
+use bitflags::bitflags;
 pub use wgpu::PrimitiveTopology;
 
 use crate::{
@@ -1393,6 +1394,43 @@ impl From<&Indices> for IndexFormat {
     }
 }
 
+bitflags! {
+    /// Mesh pipeline key bits start from the highest bit and go downward. PBR
+    /// mesh pipeline key bits start from the lowest bit and go upward. This
+    /// allows the PBR bits in the downstream crate `bevy_pbr` to coexist in the
+    /// same field without any shifts.
+    #[derive(Clone, Debug)]
+    pub struct BaseMeshPipelineKey: u32 {
+        const MORPH_TARGETS = 1 << 31;
+    }
+}
+
+impl BaseMeshPipelineKey {
+    pub const PRIMITIVE_TOPOLOGY_MASK_BITS: u32 = 0b111;
+    pub const PRIMITIVE_TOPOLOGY_SHIFT_BITS: u32 =
+        31 - Self::PRIMITIVE_TOPOLOGY_MASK_BITS.count_ones();
+
+    pub fn from_primitive_topology(primitive_topology: PrimitiveTopology) -> Self {
+        let primitive_topology_bits = ((primitive_topology as u32)
+            & Self::PRIMITIVE_TOPOLOGY_MASK_BITS)
+            << Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS;
+        Self::from_bits_retain(primitive_topology_bits)
+    }
+
+    pub fn primitive_topology(&self) -> PrimitiveTopology {
+        let primitive_topology_bits = (self.bits() >> Self::PRIMITIVE_TOPOLOGY_SHIFT_BITS)
+            & Self::PRIMITIVE_TOPOLOGY_MASK_BITS;
+        match primitive_topology_bits {
+            x if x == PrimitiveTopology::PointList as u32 => PrimitiveTopology::PointList,
+            x if x == PrimitiveTopology::LineList as u32 => PrimitiveTopology::LineList,
+            x if x == PrimitiveTopology::LineStrip as u32 => PrimitiveTopology::LineStrip,
+            x if x == PrimitiveTopology::TriangleList as u32 => PrimitiveTopology::TriangleList,
+            x if x == PrimitiveTopology::TriangleStrip as u32 => PrimitiveTopology::TriangleStrip,
+            _ => PrimitiveTopology::default(),
+        }
+    }
+}
+
 /// The GPU-representation of a [`Mesh`].
 /// Consists of a vertex data buffer and an optional index data buffer.
 #[derive(Debug, Clone)]
@@ -1402,10 +1440,17 @@ pub struct GpuMesh {
     pub vertex_count: u32,
     pub morph_targets: Option<TextureView>,
     pub buffer_info: GpuBufferInfo,
-    pub primitive_topology: PrimitiveTopology,
+    pub key_bits: BaseMeshPipelineKey,
     pub layout: MeshVertexBufferLayoutRef,
 }
 
+impl GpuMesh {
+    #[inline]
+    pub fn primitive_topology(&self) -> PrimitiveTopology {
+        self.key_bits.primitive_topology()
+    }
+}
+
 /// The index/vertex buffer info of a [`GpuMesh`].
 #[derive(Debug, Clone)]
 pub enum GpuBufferInfo {
@@ -1461,11 +1506,17 @@ impl RenderAsset for Mesh {
         let mesh_vertex_buffer_layout =
             self.get_mesh_vertex_buffer_layout(mesh_vertex_buffer_layouts);
 
+        let mut key_bits = BaseMeshPipelineKey::from_primitive_topology(self.primitive_topology());
+        key_bits.set(
+            BaseMeshPipelineKey::MORPH_TARGETS,
+            self.morph_targets.is_some(),
+        );
+
         Ok(GpuMesh {
             vertex_buffer,
             vertex_count: self.count_vertices() as u32,
             buffer_info,
-            primitive_topology: self.primitive_topology(),
+            key_bits,
             layout: mesh_vertex_buffer_layout,
             morph_targets: self
                 .morph_targets
diff --git a/crates/bevy_render/src/render_resource/pipeline_specializer.rs b/crates/bevy_render/src/render_resource/pipeline_specializer.rs
index 746f7bee7afff..bc4a36dbd191b 100644
--- a/crates/bevy_render/src/render_resource/pipeline_specializer.rs
+++ b/crates/bevy_render/src/render_resource/pipeline_specializer.rs
@@ -8,6 +8,7 @@ use crate::{
     },
 };
 use bevy_ecs::system::Resource;
+use bevy_utils::hashbrown::hash_map::VacantEntry;
 use bevy_utils::{default, hashbrown::hash_map::RawEntryMut, tracing::error, Entry, HashMap};
 use std::{fmt::Debug, hash::Hash};
 use thiserror::Error;
@@ -84,9 +85,14 @@ pub trait SpecializedMeshPipeline {
 #[derive(Resource)]
 pub struct SpecializedMeshPipelines<S: SpecializedMeshPipeline> {
     mesh_layout_cache: HashMap<(MeshVertexBufferLayoutRef, S::Key), CachedRenderPipelineId>,
-    vertex_layout_cache: HashMap<VertexBufferLayout, HashMap<S::Key, CachedRenderPipelineId>>,
+    vertex_layout_cache: VertexLayoutCache<S>,
 }
 
+pub type VertexLayoutCache<S> = HashMap<
+    VertexBufferLayout,
+    HashMap<<S as SpecializedMeshPipeline>::Key, CachedRenderPipelineId>,
+>;
+
 impl<S: SpecializedMeshPipeline> Default for SpecializedMeshPipelines<S> {
     fn default() -> Self {
         Self {
@@ -105,55 +111,72 @@ impl<S: SpecializedMeshPipeline> SpecializedMeshPipelines<S> {
         key: S::Key,
         layout: &MeshVertexBufferLayoutRef,
     ) -> Result<CachedRenderPipelineId, SpecializedMeshPipelineError> {
-        match self.mesh_layout_cache.entry((layout.clone(), key.clone())) {
+        return match self.mesh_layout_cache.entry((layout.clone(), key.clone())) {
             Entry::Occupied(entry) => Ok(*entry.into_mut()),
-            Entry::Vacant(entry) => {
-                let descriptor = specialize_pipeline
-                    .specialize(key.clone(), layout)
-                    .map_err(|mut err| {
-                        {
-                            let SpecializedMeshPipelineError::MissingVertexAttribute(err) =
-                                &mut err;
-                            err.pipeline_type = Some(std::any::type_name::<S>());
-                        }
-                        err
-                    })?;
-                // Different MeshVertexBufferLayouts can produce the same final VertexBufferLayout
-                // We want compatible vertex buffer layouts to use the same pipelines, so we must "deduplicate" them
-                let layout_map = match self
-                    .vertex_layout_cache
-                    .raw_entry_mut()
-                    .from_key(&descriptor.vertex.buffers[0])
-                {
-                    RawEntryMut::Occupied(entry) => entry.into_mut(),
-                    RawEntryMut::Vacant(entry) => {
-                        entry
-                            .insert(descriptor.vertex.buffers[0].clone(), Default::default())
-                            .1
+            Entry::Vacant(entry) => specialize_slow(
+                &mut self.vertex_layout_cache,
+                cache,
+                specialize_pipeline,
+                key,
+                layout,
+                entry,
+            ),
+        };
+
+        #[inline(never)]
+        fn specialize_slow<S>(
+            vertex_layout_cache: &mut VertexLayoutCache<S>,
+            cache: &PipelineCache,
+            specialize_pipeline: &S,
+            key: S::Key,
+            layout: &MeshVertexBufferLayoutRef,
+            entry: VacantEntry<(MeshVertexBufferLayoutRef, S::Key), CachedRenderPipelineId>,
+        ) -> Result<CachedRenderPipelineId, SpecializedMeshPipelineError>
+        where
+            S: SpecializedMeshPipeline,
+        {
+            let descriptor = specialize_pipeline
+                .specialize(key.clone(), layout)
+                .map_err(|mut err| {
+                    {
+                        let SpecializedMeshPipelineError::MissingVertexAttribute(err) = &mut err;
+                        err.pipeline_type = Some(std::any::type_name::<S>());
                     }
-                };
-                Ok(*entry.insert(match layout_map.entry(key) {
-                    Entry::Occupied(entry) => {
-                        if cfg!(debug_assertions) {
-                            let stored_descriptor =
-                                cache.get_render_pipeline_descriptor(*entry.get());
-                            if stored_descriptor != &descriptor {
-                                error!(
-                                    "The cached pipeline descriptor for {} is not \
-                                equal to the generated descriptor for the given key. \
-                                This means the SpecializePipeline implementation uses \
-                                unused' MeshVertexBufferLayout information to specialize \
-                                the pipeline. This is not allowed because it would invalidate \
-                                the pipeline cache.",
-                                    std::any::type_name::<S>()
-                                );
-                            }
+                    err
+                })?;
+            // Different MeshVertexBufferLayouts can produce the same final VertexBufferLayout
+            // We want compatible vertex buffer layouts to use the same pipelines, so we must "deduplicate" them
+            let layout_map = match vertex_layout_cache
+                .raw_entry_mut()
+                .from_key(&descriptor.vertex.buffers[0])
+            {
+                RawEntryMut::Occupied(entry) => entry.into_mut(),
+                RawEntryMut::Vacant(entry) => {
+                    entry
+                        .insert(descriptor.vertex.buffers[0].clone(), Default::default())
+                        .1
+                }
+            };
+            Ok(*entry.insert(match layout_map.entry(key) {
+                Entry::Occupied(entry) => {
+                    if cfg!(debug_assertions) {
+                        let stored_descriptor = cache.get_render_pipeline_descriptor(*entry.get());
+                        if stored_descriptor != &descriptor {
+                            error!(
+                                "The cached pipeline descriptor for {} is not \
+                                    equal to the generated descriptor for the given key. \
+                                    This means the SpecializePipeline implementation uses \
+                                    unused' MeshVertexBufferLayout information to specialize \
+                                    the pipeline. This is not allowed because it would invalidate \
+                                    the pipeline cache.",
+                                std::any::type_name::<S>()
+                            );
                         }
-                        *entry.into_mut()
                     }
-                    Entry::Vacant(entry) => *entry.insert(cache.queue_render_pipeline(descriptor)),
-                }))
-            }
+                    *entry.into_mut()
+                }
+                Entry::Vacant(entry) => *entry.insert(cache.queue_render_pipeline(descriptor)),
+            }))
         }
     }
 }
diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs
index 9938142095cc7..7861f84ac4253 100644
--- a/crates/bevy_sprite/src/mesh2d/material.rs
+++ b/crates/bevy_sprite/src/mesh2d/material.rs
@@ -426,7 +426,7 @@ pub fn queue_material2d_meshes<M: Material2d>(
                 continue;
             };
             let mesh_key =
-                view_key | Mesh2dPipelineKey::from_primitive_topology(mesh.primitive_topology);
+                view_key | Mesh2dPipelineKey::from_primitive_topology(mesh.primitive_topology());
 
             let pipeline_id = pipelines.specialize(
                 &pipeline_cache,
diff --git a/examples/2d/mesh2d_manual.rs b/examples/2d/mesh2d_manual.rs
index fa760345e5753..cdfb25c73de8c 100644
--- a/examples/2d/mesh2d_manual.rs
+++ b/examples/2d/mesh2d_manual.rs
@@ -383,7 +383,7 @@ pub fn queue_colored_mesh2d(
                 let mut mesh2d_key = mesh_key;
                 if let Some(mesh) = render_meshes.get(mesh2d_handle) {
                     mesh2d_key |=
-                        Mesh2dPipelineKey::from_primitive_topology(mesh.primitive_topology);
+                        Mesh2dPipelineKey::from_primitive_topology(mesh.primitive_topology());
                 }
 
                 let pipeline_id =
diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs
index 47c7b6e4bac0d..fbb553edc276e 100644
--- a/examples/shader/shader_instancing.rs
+++ b/examples/shader/shader_instancing.rs
@@ -133,7 +133,8 @@ fn queue_custom(
             let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
                 continue;
             };
-            let key = view_key | MeshPipelineKey::from_primitive_topology(mesh.primitive_topology);
+            let key =
+                view_key | MeshPipelineKey::from_primitive_topology(mesh.primitive_topology());
             let pipeline = pipelines
                 .specialize(&pipeline_cache, &custom_pipeline, key, &mesh.layout)
                 .unwrap();

From e44147bf20f8ae4f4c43f71d909af56e15726e1c Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Mon, 1 Apr 2024 13:55:09 -0700
Subject: [PATCH 2/3] Add an assertion that the bits didn't collide

---
 crates/bevy_pbr/Cargo.toml         |  1 +
 crates/bevy_pbr/src/render/mesh.rs | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/crates/bevy_pbr/Cargo.toml b/crates/bevy_pbr/Cargo.toml
index 6944aeb5d7c32..d75be9f3c9d07 100644
--- a/crates/bevy_pbr/Cargo.toml
+++ b/crates/bevy_pbr/Cargo.toml
@@ -50,6 +50,7 @@ serde = { version = "1", features = ["derive", "rc"] }
 bincode = "1"
 range-alloc = "0.1"
 nonmax = "0.5"
+static_assertions = "1"
 
 [lints]
 workspace = true
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 55cb93d8e2c83..6ba0bf5528d71 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -31,6 +31,7 @@ use bevy_utils::{tracing::error, Entry, HashMap, Parallel};
 
 #[cfg(debug_assertions)]
 use bevy_utils::warn_once;
+use static_assertions::const_assert_eq;
 
 use crate::render::{
     morph::{
@@ -508,7 +509,13 @@ bitflags::bitflags! {
     // NOTE: Apparently quadro drivers support up to 64x MSAA.
     /// MSAA uses the highest 3 bits for the MSAA log2(sample count) to support up to 128x MSAA.
     pub struct MeshPipelineKey: u32 {
+        // Nothing
         const NONE                              = 0;
+
+        // Inherited bits
+        const MORPH_TARGETS                     = BaseMeshPipelineKey::MORPH_TARGETS.bits();
+
+        // Flag bits
         const HDR                               = 1 << 0;
         const TONEMAP_IN_SHADER                 = 1 << 1;
         const DEBAND_DITHER                     = 1 << 2;
@@ -526,6 +533,8 @@ bitflags::bitflags! {
         const LIGHTMAPPED                       = 1 << 13;
         const IRRADIANCE_VOLUME                 = 1 << 14;
         const LAST_FLAG                         = Self::IRRADIANCE_VOLUME.bits();
+
+        // Bitfields
         const BLEND_RESERVED_BITS               = Self::BLEND_MASK_BITS << Self::BLEND_SHIFT_BITS; // ← Bitmask reserving bits for the blend state
         const BLEND_OPAQUE                      = 0 << Self::BLEND_SHIFT_BITS;                   // ← Values are just sequential within the mask, and can range from 0 to 3
         const BLEND_PREMULTIPLIED_ALPHA         = 1 << Self::BLEND_SHIFT_BITS;                   //
@@ -555,7 +564,13 @@ bitflags::bitflags! {
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_MEDIUM = 1 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_HIGH = 2 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
         const SCREEN_SPACE_SPECULAR_TRANSMISSION_ULTRA = 3 << Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_SHIFT_BITS;
-        const MORPH_TARGETS                     = BaseMeshPipelineKey::MORPH_TARGETS.bits();
+        const ALL_RESERVED_BITS =
+            Self::BLEND_RESERVED_BITS.bits() |
+            Self::MSAA_RESERVED_BITS.bits() |
+            Self::TONEMAP_METHOD_RESERVED_BITS.bits() |
+            Self::SHADOW_FILTER_METHOD_RESERVED_BITS.bits() |
+            Self::VIEW_PROJECTION_RESERVED_BITS.bits() |
+            Self::SCREEN_SPACE_SPECULAR_TRANSMISSION_RESERVED_BITS.bits();
     }
 }
 
@@ -622,6 +637,13 @@ impl MeshPipelineKey {
     }
 }
 
+// Ensure that the PBR bits don't collide with the flags inherited from `BaseMeshPipelineKey`.
+const_assert_eq!(
+    (((MeshPipelineKey::LAST_FLAG.bits() << 1) - 1) | MeshPipelineKey::ALL_RESERVED_BITS.bits())
+        & BaseMeshPipelineKey::all().bits(),
+    0
+);
+
 fn is_skinned(layout: &MeshVertexBufferLayoutRef) -> bool {
     layout.0.contains(Mesh::ATTRIBUTE_JOINT_INDEX)
         && layout.0.contains(Mesh::ATTRIBUTE_JOINT_WEIGHT)

From 516ac6a58ed4ea3993c99fcf863f4d6f19c52aa7 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Mon, 1 Apr 2024 13:55:59 -0700
Subject: [PATCH 3/3] Reword comment

---
 crates/bevy_render/src/mesh/mesh/mod.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/bevy_render/src/mesh/mesh/mod.rs b/crates/bevy_render/src/mesh/mesh/mod.rs
index 7756ed61b42e7..f1c229f5f4230 100644
--- a/crates/bevy_render/src/mesh/mesh/mod.rs
+++ b/crates/bevy_render/src/mesh/mesh/mod.rs
@@ -1395,10 +1395,10 @@ impl From<&Indices> for IndexFormat {
 }
 
 bitflags! {
-    /// Mesh pipeline key bits start from the highest bit and go downward. PBR
-    /// mesh pipeline key bits start from the lowest bit and go upward. This
-    /// allows the PBR bits in the downstream crate `bevy_pbr` to coexist in the
-    /// same field without any shifts.
+    /// Our base mesh pipeline key bits start from the highest bit and go
+    /// downward. The PBR mesh pipeline key bits start from the lowest bit and
+    /// go upward. This allows the PBR bits in the downstream crate `bevy_pbr`
+    /// to coexist in the same field without any shifts.
     #[derive(Clone, Debug)]
     pub struct BaseMeshPipelineKey: u32 {
         const MORPH_TARGETS = 1 << 31;