diff --git a/CHANGELOG.md b/CHANGELOG.md
index e80adea94e..730e03a06c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,19 @@
 # Change Log
 
-## wgpu-0.12.1
+## wgpu-core-0.12.1, wgpu-hal-0.12.1 (2021-12-29)
   - zero initialization uses now render target clears when possible (faster and doesn't enforce COPY_DST internally if not necessary)
     - fix use of MSAA targets in WebGL
     - fix not providing `COPY_DST` flag for textures causing assertions in some cases
     - fix surface textures not getting zero initialized
     - clear_texture supports now depth/stencil targets
   - error message on creating depth/stencil volume texture
+  - Vulkan:
+    - fix validation error on debug message types
+  - DX12:
+    - fix check for integrated GPUs
+    - fix stencil subresource transitions
+  - Metal:
+    - implement push constants
 
 ## wgpu-0.12 (2021-12-18)
   - API:
diff --git a/Cargo.lock b/Cargo.lock
index e02857f707..47f17ea587 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -43,9 +43,9 @@ dependencies = [
 
 [[package]]
 name = "ash"
-version = "0.33.3+1.2.191"
+version = "0.34.0+1.2.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc4f1d82f164f838ae413296d1131aa6fa79b917d25bebaa7033d25620c09219"
+checksum = "b0f780da53d0063880d45554306489f09dd8d1bda47688b4a57bc579119356df"
 dependencies = [
  "libloading",
 ]
@@ -1656,7 +1656,7 @@ dependencies = [
 
 [[package]]
 name = "wgpu-core"
-version = "0.12.0"
+version = "0.12.1"
 dependencies = [
  "arrayvec",
  "bitflags",
@@ -1679,7 +1679,7 @@ dependencies = [
 
 [[package]]
 name = "wgpu-hal"
-version = "0.12.0"
+version = "0.12.1"
 dependencies = [
  "arrayvec",
  "ash",
diff --git a/wgpu-core/Cargo.toml b/wgpu-core/Cargo.toml
index 8d68c4c8c6..a88f2a8ae4 100644
--- a/wgpu-core/Cargo.toml
+++ b/wgpu-core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "wgpu-core"
-version = "0.12.0"
+version = "0.12.1"
 authors = ["wgpu developers"]
 edition = "2018"
 description = "WebGPU core logic on wgpu-hal"
diff --git a/wgpu-hal/Cargo.toml b/wgpu-hal/Cargo.toml
index 43e850ba6f..7ca9722b3c 100644
--- a/wgpu-hal/Cargo.toml
+++ b/wgpu-hal/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "wgpu-hal"
-version = "0.12.0"
+version = "0.12.1"
 authors = ["wgpu developers"]
 edition = "2018"
 description = "WebGPU hardware abstraction layer"
@@ -37,7 +37,7 @@ block = { version = "0.1", optional = true }
 foreign-types = { version = "0.3", optional = true }
 
 # backend: Vulkan
-ash = { version = "0.33", optional = true }
+ash = { version = "0.34", optional = true, default-features = false, features = ["debug", "loaded"] }
 gpu-alloc = { version = "0.5", optional = true }
 gpu-descriptor = { version = "0.2", optional = true }
 inplace_it = { version ="0.3.3", optional = true }
diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs
index 37046dffcc..69f63b9350 100644
--- a/wgpu-hal/src/dx12/adapter.rs
+++ b/wgpu-hal/src/dx12/adapter.rs
@@ -98,7 +98,7 @@ impl super::Adapter {
             device_type: if (desc.Flags & dxgi::DXGI_ADAPTER_FLAG_SOFTWARE) != 0 {
                 workarounds.avoid_cpu_descriptor_overwrites = true;
                 wgt::DeviceType::Cpu
-            } else if features_architecture.CacheCoherentUMA != 0 {
+            } else if features_architecture.UMA != 0 {
                 wgt::DeviceType::IntegratedGpu
             } else {
                 wgt::DeviceType::DiscreteGpu
diff --git a/wgpu-hal/src/dx12/command.rs b/wgpu-hal/src/dx12/command.rs
index 91a6bd9b67..48f72377b0 100644
--- a/wgpu-hal/src/dx12/command.rs
+++ b/wgpu-hal/src/dx12/command.rs
@@ -319,15 +319,30 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                     // Only one barrier if it affects the whole image.
                     self.temp.barriers.push(raw);
                 } else {
-                    // Generate barrier for each layer/level combination.
+                    // Selected texture aspect is relevant if the texture format has both depth _and_ stencil aspects.
+                    let planes = if crate::FormatAspects::from(barrier.texture.format)
+                        .contains(crate::FormatAspects::DEPTH | crate::FormatAspects::STENCIL)
+                    {
+                        match barrier.range.aspect {
+                            wgt::TextureAspect::All => 0..2,
+                            wgt::TextureAspect::StencilOnly => 1..2,
+                            wgt::TextureAspect::DepthOnly => 0..1,
+                        }
+                    } else {
+                        0..1
+                    };
+
                     for rel_mip_level in 0..mip_level_count {
                         for rel_array_layer in 0..array_layer_count {
-                            raw.u.Transition_mut().Subresource = barrier.texture.calc_subresource(
-                                barrier.range.base_mip_level + rel_mip_level,
-                                barrier.range.base_array_layer + rel_array_layer,
-                                0,
-                            );
-                            self.temp.barriers.push(raw);
+                            for plane in planes.clone() {
+                                raw.u.Transition_mut().Subresource =
+                                    barrier.texture.calc_subresource(
+                                        barrier.range.base_mip_level + rel_mip_level,
+                                        barrier.range.base_array_layer + rel_array_layer,
+                                        plane,
+                                    );
+                                self.temp.barriers.push(raw);
+                            }
                         }
                     }
                 }
@@ -607,10 +622,15 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
         }
         if let Some(ref ds) = desc.depth_stencil_attachment {
             let mut flags = native::ClearFlags::empty();
-            if !ds.depth_ops.contains(crate::AttachmentOps::LOAD) {
+            let aspects = ds.target.view.format_aspects;
+            if !ds.depth_ops.contains(crate::AttachmentOps::LOAD)
+                && aspects.contains(crate::FormatAspects::DEPTH)
+            {
                 flags |= native::ClearFlags::DEPTH;
             }
-            if !ds.stencil_ops.contains(crate::AttachmentOps::LOAD) {
+            if !ds.stencil_ops.contains(crate::AttachmentOps::LOAD)
+                && aspects.contains(crate::FormatAspects::STENCIL)
+            {
                 flags |= native::ClearFlags::STENCIL;
             }
 
diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs
index 25423ed5f5..660601d41a 100644
--- a/wgpu-hal/src/dx12/device.rs
+++ b/wgpu-hal/src/dx12/device.rs
@@ -1,3 +1,5 @@
+use crate::FormatAspects;
+
 use super::{conv, descriptor, view, HResult as _};
 use parking_lot::Mutex;
 use std::{ffi, mem, num::NonZeroU32, ptr, slice, sync::Arc};
@@ -495,6 +497,7 @@ impl crate::Device<super::Api> for super::Device {
 
         Ok(super::TextureView {
             raw_format: view_desc.format,
+            format_aspects: FormatAspects::from(desc.format),
             target_base: (
                 texture.resource,
                 texture.calc_subresource(desc.range.base_mip_level, desc.range.base_array_layer, 0),
@@ -558,7 +561,7 @@ impl crate::Device<super::Api> for super::Device {
                 .usage
                 .intersects(crate::TextureUses::DEPTH_STENCIL_WRITE)
             {
-                let raw_desc = view_desc.to_dsv(crate::FormatAspects::empty());
+                let raw_desc = view_desc.to_dsv(FormatAspects::empty());
                 let handle = self.dsv_pool.lock().alloc_handle();
                 self.raw.CreateDepthStencilView(
                     texture.resource.as_mut_ptr(),
diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 1867d02368..b4dd295059 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -422,6 +422,7 @@ impl Texture {
 #[derive(Debug)]
 pub struct TextureView {
     raw_format: native::Format,
+    format_aspects: crate::FormatAspects, // May explicitly ignore stencil aspect of raw_format!
     target_base: (native::Resource, u32),
     handle_srv: Option<descriptor::Handle>,
     handle_uav: Option<descriptor::Handle>,
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index bb7b5d3832..a794ec12d0 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -918,6 +918,7 @@ impl super::PrivateCapabilities {
             | F::MAPPABLE_PRIMARY_BUFFERS
             | F::VERTEX_WRITABLE_STORAGE
             | F::TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES
+            | F::PUSH_CONSTANTS
             | F::POLYGON_MODE_LINE
             | F::CLEAR_COMMANDS
             | F::TEXTURE_FORMAT_16BIT_NORM;
diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs
index 465d91e197..e7a0642b89 100644
--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@@ -16,6 +16,7 @@ impl Default for super::CommandState {
             stage_infos: Default::default(),
             storage_buffer_length_map: Default::default(),
             work_group_memory_sizes: Vec::new(),
+            push_constants: Vec::new(),
         }
     }
 }
@@ -61,6 +62,7 @@ impl super::CommandState {
         self.stage_infos.fs.clear();
         self.stage_infos.cs.clear();
         self.work_group_memory_sizes.clear();
+        self.push_constants.clear();
     }
 
     fn make_sizes_buffer_update<'a>(
@@ -587,12 +589,41 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn set_push_constants(
         &mut self,
-        _layout: &super::PipelineLayout,
-        _stages: wgt::ShaderStages,
-        _offset: u32,
-        _data: &[u32],
+        layout: &super::PipelineLayout,
+        stages: wgt::ShaderStages,
+        offset: u32,
+        data: &[u32],
     ) {
-        //TODO
+        let state_pc = &mut self.state.push_constants;
+        if state_pc.len() < layout.total_push_constants as usize {
+            state_pc.resize(layout.total_push_constants as usize, 0);
+        }
+        assert_eq!(offset as usize % WORD_SIZE, 0);
+
+        let offset = offset as usize / WORD_SIZE;
+        state_pc[offset..offset + data.len()].copy_from_slice(data);
+
+        if stages.contains(wgt::ShaderStages::COMPUTE) {
+            self.state.compute.as_ref().unwrap().set_bytes(
+                layout.push_constants_infos.cs.unwrap().buffer_index as _,
+                (layout.total_push_constants as usize * WORD_SIZE) as _,
+                state_pc.as_ptr() as _,
+            )
+        }
+        if stages.contains(wgt::ShaderStages::VERTEX) {
+            self.state.render.as_ref().unwrap().set_vertex_bytes(
+                layout.push_constants_infos.vs.unwrap().buffer_index as _,
+                (layout.total_push_constants as usize * WORD_SIZE) as _,
+                state_pc.as_ptr() as _,
+            )
+        }
+        if stages.contains(wgt::ShaderStages::FRAGMENT) {
+            self.state.render.as_ref().unwrap().set_fragment_bytes(
+                layout.push_constants_infos.fs.unwrap().buffer_index as _,
+                (layout.total_push_constants as usize * WORD_SIZE) as _,
+                state_pc.as_ptr() as _,
+            )
+        }
     }
 
     unsafe fn insert_debug_marker(&mut self, label: &str) {
diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs
index d4018dcfd1..51b96a6657 100644
--- a/wgpu-hal/src/metal/device.rs
+++ b/wgpu-hal/src/metal/device.rs
@@ -471,6 +471,7 @@ impl crate::Device<super::Api> for super::Device {
         let mut bind_group_infos = arrayvec::ArrayVec::new();
 
         // First, place the push constants
+        let mut total_push_constants = 0;
         for info in stage_data.iter_mut() {
             for pcr in desc.push_constant_ranges {
                 if pcr.stages.contains(map_naga_stage(info.stage)) {
@@ -492,6 +493,8 @@ impl crate::Device<super::Api> for super::Device {
                 info.pc_buffer = Some(info.counters.buffers);
                 info.counters.buffers += 1;
             }
+
+            total_push_constants = total_push_constants.max(info.pc_limit);
         }
 
         // Second, place the described resources
@@ -641,6 +644,7 @@ impl crate::Device<super::Api> for super::Device {
                     image: naga::proc::BoundsCheckPolicy::ReadZeroSkipWrite,
                 },
             },
+            total_push_constants,
         })
     }
     unsafe fn destroy_pipeline_layout(&self, _pipeline_layout: super::PipelineLayout) {}
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index 9dc686d3c8..2ea58c2635 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -524,6 +524,7 @@ pub struct PipelineLayout {
     bind_group_infos: ArrayVec<BindGroupLayoutInfo, { crate::MAX_BIND_GROUPS }>,
     push_constants_infos: MultiStageData<Option<PushConstantsInfo>>,
     total_counters: MultiStageResourceCounters,
+    total_push_constants: u32,
 }
 
 trait AsNative {
@@ -709,6 +710,7 @@ struct CommandState {
     stage_infos: MultiStageData<PipelineStageInfo>,
     storage_buffer_length_map: fxhash::FxHashMap<naga::ResourceBinding, wgt::BufferSize>,
     work_group_memory_sizes: Vec<u32>,
+    push_constants: Vec<u32>,
 }
 
 pub struct CommandEncoder {
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index be4e94046a..8c98f62df6 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -1116,8 +1116,8 @@ impl super::Adapter {
         let timeline_semaphore_fn = if enabled_extensions.contains(&khr::TimelineSemaphore::name())
         {
             Some(super::ExtensionFn::Extension(khr::TimelineSemaphore::new(
-                &self.instance.entry,
                 &self.instance.raw,
+                &raw_device,
             )))
         } else if self.phd_capabilities.properties.api_version >= vk::API_VERSION_1_2 {
             Some(super::ExtensionFn::Promoted)
diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs
index deca4839df..dbcb1123ee 100644
--- a/wgpu-hal/src/vulkan/command.rs
+++ b/wgpu-hal/src/vulkan/command.rs
@@ -594,9 +594,11 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
             .cmd_set_scissor(self.active, 0, &vk_scissors);
     }
     unsafe fn set_stencil_reference(&mut self, value: u32) {
-        self.device
-            .raw
-            .cmd_set_stencil_reference(self.active, vk::StencilFaceFlags::all(), value);
+        self.device.raw.cmd_set_stencil_reference(
+            self.active,
+            vk::StencilFaceFlags::FRONT_AND_BACK,
+            value,
+        );
     }
     unsafe fn set_blend_constants(&mut self, color: &[f32; 4]) {
         self.device.raw.cmd_set_blend_constants(self.active, color);
diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs
index 1b21385c7c..fd1b49853c 100644
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@@ -1716,7 +1716,7 @@ impl crate::Device<super::Api> for super::Device {
                     .values(&values);
                 let result = match self.shared.extension_fns.timeline_semaphore {
                     Some(super::ExtensionFn::Extension(ref ext)) => {
-                        ext.wait_semaphores(self.shared.raw.handle(), &vk_info, timeout_us)
+                        ext.wait_semaphores(&vk_info, timeout_us)
                     }
                     Some(super::ExtensionFn::Promoted) => {
                         self.shared.raw.wait_semaphores(&vk_info, timeout_us)
diff --git a/wgpu-hal/src/vulkan/instance.rs b/wgpu-hal/src/vulkan/instance.rs
index bf1297bdac..71e0fc7e38 100644
--- a/wgpu-hal/src/vulkan/instance.rs
+++ b/wgpu-hal/src/vulkan/instance.rs
@@ -208,7 +208,11 @@ impl super::Instance {
             let vk_info = vk::DebugUtilsMessengerCreateInfoEXT::builder()
                 .flags(vk::DebugUtilsMessengerCreateFlagsEXT::empty())
                 .message_severity(severity)
-                .message_type(vk::DebugUtilsMessageTypeFlagsEXT::all())
+                .message_type(
+                    vk::DebugUtilsMessageTypeFlagsEXT::GENERAL
+                        | vk::DebugUtilsMessageTypeFlagsEXT::VALIDATION
+                        | vk::DebugUtilsMessageTypeFlagsEXT::PERFORMANCE,
+                )
                 .pfn_user_callback(Some(debug_utils_messenger_callback));
             let messenger = extension
                 .create_debug_utils_messenger(&vk_info, None)
@@ -438,7 +442,7 @@ impl Drop for super::InstanceShared {
 
 impl crate::Instance<super::Api> for super::Instance {
     unsafe fn init(desc: &crate::InstanceDescriptor) -> Result<Self, crate::InstanceError> {
-        let entry = match ash::Entry::new() {
+        let entry = match ash::Entry::load() {
             Ok(entry) => entry,
             Err(err) => {
                 log::info!("Missing Vulkan entry points: {:?}", err);
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 457911a3cd..234d737a19 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -514,9 +514,7 @@ impl Fence {
         match *self {
             Self::TimelineSemaphore(raw) => unsafe {
                 Ok(match *extension.unwrap() {
-                    ExtensionFn::Extension(ref ext) => {
-                        ext.get_semaphore_counter_value(device.handle(), raw)?
-                    }
+                    ExtensionFn::Extension(ref ext) => ext.get_semaphore_counter_value(raw)?,
                     ExtensionFn::Promoted => device.get_semaphore_counter_value(raw)?,
                 })
             },
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index f7df88fd11..5acacbca68 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -628,9 +628,9 @@ pub struct Limits {
     pub max_sampled_textures_per_shader_stage: u32,
     /// Amount of samplers visible in a single shader stage. Defaults to 16. Higher is "better".
     pub max_samplers_per_shader_stage: u32,
-    /// Amount of storage buffers visible in a single shader stage. Defaults to 4. Higher is "better".
+    /// Amount of storage buffers visible in a single shader stage. Defaults to 8. Higher is "better".
     pub max_storage_buffers_per_shader_stage: u32,
-    /// Amount of storage textures visible in a single shader stage. Defaults to 4. Higher is "better".
+    /// Amount of storage textures visible in a single shader stage. Defaults to 8. Higher is "better".
     pub max_storage_textures_per_shader_stage: u32,
     /// Amount of uniform buffers visible in a single shader stage. Defaults to 12. Higher is "better".
     pub max_uniform_buffers_per_shader_stage: u32,
@@ -667,11 +667,13 @@ pub struct Limits {
     /// Defaults to 256. Lower is "better".
     pub min_storage_buffer_offset_alignment: u32,
     /// Maximum allowed number of components (scalars) of input or output locations for
-    /// inter-stage communication (vertex outputs to fragment inputs).
+    /// inter-stage communication (vertex outputs to fragment inputs). Defaults to 60.
     pub max_inter_stage_shader_components: u32,
-    /// Maximum number of bytes used for workgroup memory in a compute entry point.
+    /// Maximum number of bytes used for workgroup memory in a compute entry point. Defaults to
+    /// 16352.
     pub max_compute_workgroup_storage_size: u32,
     /// Maximum value of the product of the `workgroup_size` dimensions for a compute entry-point.
+    /// Defaults to 256.
     pub max_compute_invocations_per_workgroup: u32,
     /// The maximum value of the workgroup_size X dimension for a compute stage `ShaderModule` entry-point.
     /// Defaults to 256.
@@ -680,7 +682,7 @@ pub struct Limits {
     /// Defaults to 256.
     pub max_compute_workgroup_size_y: u32,
     /// The maximum value of the workgroup_size Z dimension for a compute stage `ShaderModule` entry-point.
-    /// Defaults to 256.
+    /// Defaults to 64.
     pub max_compute_workgroup_size_z: u32,
     /// The maximum value for each dimension of a `ComputePass::dispatch(x, y, z)` operation.
     /// Defaults to 65535.