diff --git a/src/back/hlsl/help.rs b/src/back/hlsl/help.rs
index 5f81c296dc..ec913ba66d 100644
--- a/src/back/hlsl/help.rs
+++ b/src/back/hlsl/help.rs
@@ -54,6 +54,11 @@ pub(super) struct WrappedStructMatrixAccess {
     pub(super) index: u32,
 }
 
+#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
+pub(super) struct WrappedMatCx2 {
+    pub(super) columns: crate::VectorSize,
+}
+
 /// HLSL backend requires its own `ImageQuery` enum.
 ///
 /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function.
@@ -461,12 +466,36 @@ impl<'a, W: Write> super::Writer<'a, W> {
                                 )?;
                             }
                         }
-                        _ => {
-                            writeln!(
-                                self.out,
-                                "{}{}.{} = {}{};",
-                                INDENT, RETURN_VARIABLE_NAME, field_name, ARGUMENT_VARIABLE_NAME, i,
-                            )?;
+                        ref other => {
+                            // We cast arrays of native HLSL `floatCx2`s to arrays of `matCx2`s
+                            // (where the inner matrix is represented by a struct with C `float2` members).
+                            // See the module-level block comment in mod.rs for details.
+                            if let Some(super::writer::MatrixType {
+                                columns,
+                                rows: crate::VectorSize::Bi,
+                                width: 4,
+                            }) = super::writer::get_inner_matrix_data(module, member.ty)
+                            {
+                                write!(
+                                    self.out,
+                                    "{}{}.{} = (__mat{}x2",
+                                    INDENT, RETURN_VARIABLE_NAME, field_name, columns as u8
+                                )?;
+                                if let crate::TypeInner::Array { base, size, .. } = *other {
+                                    self.write_array_size(module, base, size)?;
+                                }
+                                writeln!(self.out, "){}{};", ARGUMENT_VARIABLE_NAME, i,)?;
+                            } else {
+                                writeln!(
+                                    self.out,
+                                    "{}{}.{} = {}{};",
+                                    INDENT,
+                                    RETURN_VARIABLE_NAME,
+                                    field_name,
+                                    ARGUMENT_VARIABLE_NAME,
+                                    i,
+                                )?;
+                            }
                         }
                     }
                 }
@@ -1050,4 +1079,117 @@ impl<'a, W: Write> super::Writer<'a, W> {
         }
         Ok(())
     }
+
+    pub(super) fn write_mat_cx2_typedef_and_functions(
+        &mut self,
+        WrappedMatCx2 { columns }: WrappedMatCx2,
+    ) -> BackendResult {
+        use crate::back::INDENT;
+
+        // typedef
+        write!(self.out, "typedef struct {{ ")?;
+        for i in 0..columns as u8 {
+            write!(self.out, "float2 _{}; ", i)?;
+        }
+        writeln!(self.out, "}} __mat{}x2;", columns as u8)?;
+
+        // __get_col_of_mat
+        writeln!(
+            self.out,
+            "float2 __get_col_of_mat{}x2(__mat{}x2 mat, uint idx) {{",
+            columns as u8, columns as u8
+        )?;
+        writeln!(self.out, "{}switch(idx) {{", INDENT)?;
+        for i in 0..columns as u8 {
+            writeln!(self.out, "{}case {}: {{ return mat._{}; }}", INDENT, i, i)?;
+        }
+        writeln!(self.out, "{}default: {{ return (float2)0; }}", INDENT)?;
+        writeln!(self.out, "{}}}", INDENT)?;
+        writeln!(self.out, "}}")?;
+
+        // __set_col_of_mat
+        writeln!(
+            self.out,
+            "void __set_col_of_mat{}x2(__mat{}x2 mat, uint idx, float2 value) {{",
+            columns as u8, columns as u8
+        )?;
+        writeln!(self.out, "{}switch(idx) {{", INDENT)?;
+        for i in 0..columns as u8 {
+            writeln!(
+                self.out,
+                "{}case {}: {{ mat._{} = value; break; }}",
+                INDENT, i, i
+            )?;
+        }
+        writeln!(self.out, "{}}}", INDENT)?;
+        writeln!(self.out, "}}")?;
+
+        // __set_el_of_mat
+        writeln!(
+            self.out,
+            "void __set_el_of_mat{}x2(__mat{}x2 mat, uint idx, uint vec_idx, float value) {{",
+            columns as u8, columns as u8
+        )?;
+        writeln!(self.out, "{}switch(idx) {{", INDENT)?;
+        for i in 0..columns as u8 {
+            writeln!(
+                self.out,
+                "{}case {}: {{ mat._{}[vec_idx] = value; break; }}",
+                INDENT, i, i
+            )?;
+        }
+        writeln!(self.out, "{}}}", INDENT)?;
+        writeln!(self.out, "}}")?;
+
+        writeln!(self.out)?;
+
+        Ok(())
+    }
+
+    pub(super) fn write_all_mat_cx2_typedefs_and_functions(
+        &mut self,
+        module: &crate::Module,
+    ) -> BackendResult {
+        for (handle, _) in module.global_variables.iter() {
+            let global = &module.global_variables[handle];
+
+            if global.space == crate::AddressSpace::Uniform {
+                if let Some(super::writer::MatrixType {
+                    columns,
+                    rows: crate::VectorSize::Bi,
+                    width: 4,
+                }) = super::writer::get_inner_matrix_data(module, global.ty)
+                {
+                    let entry = WrappedMatCx2 { columns };
+                    if !self.wrapped.mat_cx2s.contains(&entry) {
+                        self.write_mat_cx2_typedef_and_functions(entry)?;
+                        self.wrapped.mat_cx2s.insert(entry);
+                    }
+                }
+            }
+        }
+
+        for (_, ty) in module.types.iter() {
+            if let crate::TypeInner::Struct { ref members, .. } = ty.inner {
+                for member in members.iter() {
+                    if let crate::TypeInner::Array { .. } = module.types[member.ty].inner {
+                        if let Some(super::writer::MatrixType {
+                            columns,
+                            rows: crate::VectorSize::Bi,
+                            width: 4,
+                        }) = super::writer::get_inner_matrix_data(module, member.ty)
+                        {
+                            let entry = WrappedMatCx2 { columns };
+                            if !self.wrapped.mat_cx2s.contains(&entry) {
+                                self.write_mat_cx2_typedef_and_functions(entry)?;
+                                self.wrapped.mat_cx2s.insert(entry);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
 }
diff --git a/src/back/hlsl/mod.rs b/src/back/hlsl/mod.rs
index db027f7769..76abb29783 100644
--- a/src/back/hlsl/mod.rs
+++ b/src/back/hlsl/mod.rs
@@ -57,8 +57,8 @@ that the columns of a `matKx2<f32>` need only be [aligned as required
 for `vec2<f32>`][ilov], which is [eight-byte alignment][8bb].
 
 To compensate for this, any time a `matKx2<f32>` appears in a WGSL
-`uniform` variable, whether directly as the variable's type or as a
-struct member, we actually emit `K` separate `float2` members, and
+`uniform` variable, whether directly as the variable's type or as part
+of a struct/array, we actually emit `K` separate `float2` members, and
 assemble/disassemble the matrix from its columns (in WGSL; rows in
 HLSL) upon load and store.
 
@@ -92,14 +92,10 @@ float3x2 GetMatmOnBaz(Baz obj) {
 We also emit an analogous `Set` function, as well as functions for
 accessing individual columns by dynamic index.
 
-At present, we do not generate correct HLSL when `matCx2<f32>` us used
-directly as the type of a WGSL `uniform` global ([#1837]).
-
 [hlsl]: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl
 [ilov]: https://gpuweb.github.io/gpuweb/wgsl/#internal-value-layout
 [16bb]: https://github.com/microsoft/DirectXShaderCompiler/wiki/Buffer-Packing#constant-buffer-packing
 [8bb]: https://gpuweb.github.io/gpuweb/wgsl/#alignment-and-size
-[#1837]: https://github.com/gfx-rs/naga/issues/1837
 */
 
 mod conv;
@@ -253,6 +249,7 @@ struct Wrapped {
     image_queries: crate::FastHashSet<help::WrappedImageQuery>,
     constructors: crate::FastHashSet<help::WrappedConstructor>,
     struct_matrix_access: crate::FastHashSet<help::WrappedStructMatrixAccess>,
+    mat_cx2s: crate::FastHashSet<help::WrappedMatCx2>,
 }
 
 impl Wrapped {
@@ -261,6 +258,7 @@ impl Wrapped {
         self.image_queries.clear();
         self.constructors.clear();
         self.struct_matrix_access.clear();
+        self.mat_cx2s.clear();
     }
 }
 
diff --git a/src/back/hlsl/writer.rs b/src/back/hlsl/writer.rs
index 4a14cc6ba8..b582cfa4f2 100644
--- a/src/back/hlsl/writer.rs
+++ b/src/back/hlsl/writer.rs
@@ -150,6 +150,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             .map(|ep| (ep.stage, ep.function.result.clone()))
             .collect::<Vec<(ShaderStage, Option<crate::FunctionResult>)>>();
 
+        self.write_all_mat_cx2_typedefs_and_functions(module)?;
+
         // Write all structs
         for (handle, ty) in module.types.iter() {
             if let TypeInner::Struct { ref members, span } = ty.inner {
@@ -661,19 +663,41 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
 
         if global.space == crate::AddressSpace::Uniform {
             write!(self.out, " {{ ")?;
-            // Even though Naga IR matrices are column-major, we must describe
-            // matrices passed from the CPU as being in row-major order.  See
-            // the module-level comments for details.
-            if let TypeInner::Matrix { .. } = module.types[global.ty].inner {
-                write!(self.out, "row_major ")?;
-            }
-            self.write_type(module, global.ty)?;
-            let sub_name = &self.names[&NameKey::GlobalVariable(handle)];
-            write!(self.out, " {}", sub_name)?;
+
+            let matrix_data = get_inner_matrix_data(module, global.ty);
+
+            // We treat matrices of the form `matCx2` as a sequence of C `vec2`s.
+            // See the module-level block comment in mod.rs for details.
+            if let Some(MatrixType {
+                columns,
+                rows: crate::VectorSize::Bi,
+                width: 4,
+            }) = matrix_data
+            {
+                write!(
+                    self.out,
+                    "__mat{}x2 {}",
+                    columns as u8,
+                    &self.names[&NameKey::GlobalVariable(handle)]
+                )?;
+            } else {
+                // Even though Naga IR matrices are column-major, we must describe
+                // matrices passed from the CPU as being in row-major order.
+                // See the module-level block comment in mod.rs for details.
+                if matrix_data.is_some() {
+                    write!(self.out, "row_major ")?;
+                }
+
+                self.write_type(module, global.ty)?;
+                let sub_name = &self.names[&NameKey::GlobalVariable(handle)];
+                write!(self.out, " {}", sub_name)?;
+            }
+
             // need to write the array size if the type was emitted with `write_type`
             if let TypeInner::Array { base, size, .. } = module.types[global.ty].inner {
                 self.write_array_size(module, base, size)?;
             }
+
             writeln!(self.out, "; }}")?;
         } else {
             writeln!(self.out, ";")?;
@@ -801,16 +825,31 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             write!(self.out, "{}", back::INDENT)?;
 
             match module.types[member.ty].inner {
-                TypeInner::Array {
-                    base,
-                    size,
-                    stride: _,
-                } => {
+                TypeInner::Array { base, size, .. } => {
                     // HLSL arrays are written as `type name[size]`
-                    if let TypeInner::Matrix { .. } = module.types[base].inner {
-                        write!(self.out, "row_major ")?;
+
+                    let matrix_data = get_inner_matrix_data(module, member.ty);
+
+                    // We treat matrices of the form `matCx2` as a sequence of C `vec2`s.
+                    // See the module-level block comment in mod.rs for details.
+                    if let Some(MatrixType {
+                        columns,
+                        rows: crate::VectorSize::Bi,
+                        width: 4,
+                    }) = matrix_data
+                    {
+                        write!(self.out, "__mat{}x2", columns as u8)?;
+                    } else {
+                        // Even though Naga IR matrices are column-major, we must describe
+                        // matrices passed from the CPU as being in row-major order.
+                        // See the module-level block comment in mod.rs for details.
+                        if matrix_data.is_some() {
+                            write!(self.out, "row_major ")?;
+                        }
+
+                        self.write_type(module, base)?;
                     }
-                    self.write_type(module, base)?;
+
                     // Write `name`
                     write!(
                         self.out,
@@ -820,8 +859,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                     // Write [size]
                     self.write_array_size(module, base, size)?;
                 }
-                // We treat matrices of the form `matCx2` as a sequence of C `vec2`s
-                // (see top level module docs for details).
+                // We treat matrices of the form `matCx2` as a sequence of C `vec2`s.
+                // See the module-level block comment in mod.rs for details.
                 TypeInner::Matrix {
                     rows,
                     columns,
@@ -848,6 +887,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                         self.write_modifier(binding)?;
                     }
 
+                    // Even though Naga IR matrices are column-major, we must describe
+                    // matrices passed from the CPU as being in row-major order.
+                    // See the module-level block comment in mod.rs for details.
                     if let TypeInner::Matrix { .. } = module.types[member.ty].inner {
                         write!(self.out, "row_major ")?;
                     }
@@ -1285,17 +1327,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             }
             Statement::Store { pointer, value } => {
                 let ty_inner = func_ctx.info[pointer].ty.inner_with(&module.types);
-                let array_info = match *ty_inner {
-                    TypeInner::Pointer { base, .. } => match module.types[base].inner {
-                        crate::TypeInner::Array {
-                            size: crate::ArraySize::Constant(ch),
-                            ..
-                        } => Some((ch, base)),
-                        _ => None,
-                    },
-                    _ => None,
-                };
-
                 if let Some(crate::AddressSpace::Storage { .. }) = ty_inner.pointer_space() {
                     let var_handle = self.fill_access_chain(module, pointer, func_ctx)?;
                     self.write_storage_store(
@@ -1305,26 +1336,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                         func_ctx,
                         level,
                     )?;
-                } else if let Some((const_handle, base_ty)) = array_info {
-                    let size = module.constants[const_handle].to_array_length().unwrap();
-                    writeln!(self.out, "{}{{", level)?;
-                    write!(self.out, "{}", level.next())?;
-                    self.write_type(module, base_ty)?;
-                    write!(self.out, " _result[{}]=", size)?;
-                    self.write_expr(module, value, func_ctx)?;
-                    writeln!(self.out, ";")?;
-                    write!(
-                        self.out,
-                        "{}for(int _i=0; _i<{}; ++_i) ",
-                        level.next(),
-                        size
-                    )?;
-                    self.write_expr(module, pointer, func_ctx)?;
-                    writeln!(self.out, "[_i] = _result[_i];")?;
-                    writeln!(self.out, "{}}}", level)?;
                 } else {
-                    // We treat matrices of the form `matCx2` as a sequence of C `vec2`s
-                    // (see top level module docs for details).
+                    // We treat matrices of the form `matCx2` as a sequence of C `vec2`s.
+                    // See the module-level block comment in mod.rs for details.
                     //
                     // We handle matrix Stores here directly (including sub accesses for Vectors and Scalars).
                     // Loads are handled by `Expression::AccessIndex` (since sub accesses work fine for Loads).
@@ -1487,10 +1501,132 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                             writeln!(self.out, ");")?;
                         }
                     } else {
-                        self.write_expr(module, pointer, func_ctx)?;
-                        write!(self.out, " = ")?;
-                        self.write_expr(module, value, func_ctx)?;
-                        writeln!(self.out, ";")?
+                        // We handle `Store`s to __matCx2 column vectors and scalar elements via
+                        // the previously injected functions __set_col_of_matCx2 / __set_el_of_matCx2.
+                        struct MatrixData {
+                            columns: crate::VectorSize,
+                            base: Handle<crate::Expression>,
+                        }
+
+                        enum Index {
+                            Expression(Handle<crate::Expression>),
+                            Static(u32),
+                        }
+
+                        let mut matrix = None;
+                        let mut vector = None;
+                        let mut scalar = None;
+
+                        let mut current_expr = pointer;
+                        for _ in 0..3 {
+                            let resolved = func_ctx.info[current_expr].ty.inner_with(&module.types);
+                            match (resolved, &func_ctx.expressions[current_expr]) {
+                                (
+                                    &TypeInner::ValuePointer {
+                                        size: Some(crate::VectorSize::Bi),
+                                        ..
+                                    },
+                                    &crate::Expression::Access { base, index },
+                                ) => {
+                                    vector = Some(index);
+                                    current_expr = base;
+                                }
+                                (
+                                    &TypeInner::ValuePointer { size: None, .. },
+                                    &crate::Expression::Access { base, index },
+                                ) => {
+                                    scalar = Some(Index::Expression(index));
+                                    current_expr = base;
+                                }
+                                (
+                                    &TypeInner::ValuePointer { size: None, .. },
+                                    &crate::Expression::AccessIndex { base, index },
+                                ) => {
+                                    scalar = Some(Index::Static(index));
+                                    current_expr = base;
+                                }
+                                _ => {
+                                    if let Some(MatrixType {
+                                        columns,
+                                        rows: crate::VectorSize::Bi,
+                                        width: 4,
+                                    }) = get_inner_matrix_of_struct_array_member(
+                                        module,
+                                        current_expr,
+                                        func_ctx,
+                                        true,
+                                    ) {
+                                        matrix = Some(MatrixData {
+                                            columns,
+                                            base: current_expr,
+                                        });
+                                    }
+
+                                    break;
+                                }
+                            }
+                        }
+
+                        if let (Some(MatrixData { columns, base }), Some(vec_index)) =
+                            (matrix, vector)
+                        {
+                            if scalar.is_some() {
+                                write!(self.out, "__set_el_of_mat{}x2", columns as u8)?;
+                            } else {
+                                write!(self.out, "__set_col_of_mat{}x2", columns as u8)?;
+                            }
+                            write!(self.out, "(")?;
+                            self.write_expr(module, base, func_ctx)?;
+                            write!(self.out, ", ")?;
+                            self.write_expr(module, vec_index, func_ctx)?;
+
+                            if let Some(scalar_index) = scalar {
+                                write!(self.out, ", ")?;
+                                match scalar_index {
+                                    Index::Static(index) => {
+                                        write!(self.out, "{}", index)?;
+                                    }
+                                    Index::Expression(index) => {
+                                        self.write_expr(module, index, func_ctx)?;
+                                    }
+                                }
+                            }
+
+                            write!(self.out, ", ")?;
+                            self.write_expr(module, value, func_ctx)?;
+
+                            writeln!(self.out, ");")?;
+                        } else {
+                            self.write_expr(module, pointer, func_ctx)?;
+                            write!(self.out, " = ")?;
+
+                            // We cast the RHS of this store in cases where the LHS
+                            // is a struct member with type:
+                            //  - matCx2 or
+                            //  - a (possibly nested) array of matCx2's
+                            if let Some(MatrixType {
+                                columns,
+                                rows: crate::VectorSize::Bi,
+                                width: 4,
+                            }) = get_inner_matrix_of_struct_array_member(
+                                module, pointer, func_ctx, false,
+                            ) {
+                                let mut resolved =
+                                    func_ctx.info[pointer].ty.inner_with(&module.types);
+                                if let TypeInner::Pointer { base, .. } = *resolved {
+                                    resolved = &module.types[base].inner;
+                                }
+
+                                write!(self.out, "(__mat{}x2", columns as u8)?;
+                                if let TypeInner::Array { base, size, .. } = *resolved {
+                                    self.write_array_size(module, base, size)?;
+                                }
+                                write!(self.out, ")")?;
+                            }
+
+                            self.write_expr(module, value, func_ctx)?;
+                            writeln!(self.out, ";")?
+                        }
                     }
                 }
             }
@@ -1863,6 +1999,26 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                 {
                     // do nothing, the chain is written on `Load`/`Store`
                 } else {
+                    // We use the function __get_col_of_matCx2 here in cases
+                    // where `base`s type resolves to a matCx2 and is part of a
+                    // struct member with type of (possibly nested) array of matCx2's.
+                    //
+                    // Note that this only works for `Load`s and we handle
+                    // `Store`s differently in `Statement::Store`.
+                    if let Some(MatrixType {
+                        columns,
+                        rows: crate::VectorSize::Bi,
+                        width: 4,
+                    }) = get_inner_matrix_of_struct_array_member(module, base, func_ctx, true)
+                    {
+                        write!(self.out, "__get_col_of_mat{}x2(", columns as u8)?;
+                        self.write_expr(module, base, func_ctx)?;
+                        write!(self.out, ", ")?;
+                        self.write_expr(module, index, func_ctx)?;
+                        write!(self.out, ")")?;
+                        return Ok(());
+                    }
+
                     let base_ty_res = &func_ctx.info[base].ty;
                     let resolved = base_ty_res.inner_with(&module.types);
 
@@ -1895,18 +2051,64 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                 {
                     // do nothing, the chain is written on `Load`/`Store`
                 } else {
+                    fn write_access<W: fmt::Write>(
+                        writer: &mut super::Writer<'_, W>,
+                        resolved: &TypeInner,
+                        base_ty_handle: Option<Handle<crate::Type>>,
+                        index: u32,
+                    ) -> BackendResult {
+                        match *resolved {
+                            TypeInner::Vector { .. } => {
+                                // Write vector access as a swizzle
+                                write!(writer.out, ".{}", back::COMPONENTS[index as usize])?
+                            }
+                            TypeInner::Matrix { .. }
+                            | TypeInner::Array { .. }
+                            | TypeInner::BindingArray { .. }
+                            | TypeInner::ValuePointer { .. } => write!(writer.out, "[{}]", index)?,
+                            TypeInner::Struct { .. } => {
+                                // This will never panic in case the type is a `Struct`, this is not true
+                                // for other types so we can only check while inside this match arm
+                                let ty = base_ty_handle.unwrap();
+
+                                write!(
+                                    writer.out,
+                                    ".{}",
+                                    &writer.names[&NameKey::StructMember(ty, index)]
+                                )?
+                            }
+                            ref other => {
+                                return Err(Error::Custom(format!("Cannot index {:?}", other)))
+                            }
+                        }
+                        Ok(())
+                    }
+
+                    // We write the matrix column access in a special way since
+                    // the type of `base` is our special __matCx2 struct.
+                    if let Some(MatrixType {
+                        rows: crate::VectorSize::Bi,
+                        width: 4,
+                        ..
+                    }) = get_inner_matrix_of_struct_array_member(module, base, func_ctx, true)
+                    {
+                        self.write_expr(module, base, func_ctx)?;
+                        write!(self.out, "._{}", index)?;
+                        return Ok(());
+                    }
+
                     let base_ty_res = &func_ctx.info[base].ty;
                     let mut resolved = base_ty_res.inner_with(&module.types);
                     let base_ty_handle = match *resolved {
-                        TypeInner::Pointer { base, space: _ } => {
+                        TypeInner::Pointer { base, .. } => {
                             resolved = &module.types[base].inner;
                             Some(base)
                         }
                         _ => base_ty_res.handle(),
                     };
 
-                    // We treat matrices of the form `matCx2` as a sequence of C `vec2`s
-                    // (see top level module docs for details).
+                    // We treat matrices of the form `matCx2` as a sequence of C `vec2`s.
+                    // See the module-level block comment in mod.rs for details.
                     //
                     // We handle matrix reconstruction here for Loads.
                     // Stores are handled directly by `Statement::Store`.
@@ -1929,34 +2131,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                             }
                             _ => {}
                         }
-                    };
+                    }
 
                     self.write_expr(module, base, func_ctx)?;
-
-                    match *resolved {
-                        TypeInner::Vector { .. } => {
-                            // Write vector access as a swizzle
-                            write!(self.out, ".{}", back::COMPONENTS[index as usize])?
-                        }
-                        TypeInner::Matrix { .. }
-                        | TypeInner::Array { .. }
-                        | TypeInner::BindingArray { .. }
-                        | TypeInner::ValuePointer { .. } => write!(self.out, "[{}]", index)?,
-                        TypeInner::Struct { .. } => {
-                            // This will never panic in case the type is a `Struct`, this is not true
-                            // for other types so we can only check while inside this match arm
-                            let ty = base_ty_handle.unwrap();
-
-                            write!(
-                                self.out,
-                                ".{}",
-                                &self.names[&NameKey::StructMember(ty, index)]
-                            )?
-                        }
-                        ref other => {
-                            return Err(Error::Custom(format!("Cannot index {:?}", other)))
-                        }
-                    }
+                    write_access(self, resolved, base_ty_handle, index)?;
                 }
             }
             Expression::FunctionArgument(pos) => {
@@ -2127,7 +2305,42 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                         self.write_storage_load(module, var_handle, result_ty, func_ctx)?;
                     }
                     _ => {
+                        let mut close_paren = false;
+
+                        // We cast the value loaded to a native HLSL floatCx2
+                        // in cases where it is of type:
+                        //  - __matCx2 or
+                        //  - a (possibly nested) array of __matCx2's
+                        if let Some(MatrixType {
+                            rows: crate::VectorSize::Bi,
+                            width: 4,
+                            ..
+                        }) = get_inner_matrix_of_struct_array_member(
+                            module, pointer, func_ctx, false,
+                        )
+                        .or_else(|| get_inner_matrix_of_global_uniform(module, pointer, func_ctx))
+                        {
+                            let mut resolved = func_ctx.info[pointer].ty.inner_with(&module.types);
+                            if let TypeInner::Pointer { base, .. } = *resolved {
+                                resolved = &module.types[base].inner;
+                            }
+
+                            write!(self.out, "((")?;
+                            if let TypeInner::Array { base, size, .. } = *resolved {
+                                self.write_type(module, base)?;
+                                self.write_array_size(module, base, size)?;
+                            } else {
+                                self.write_value_type(module, resolved)?;
+                            }
+                            write!(self.out, ")")?;
+                            close_paren = true;
+                        }
+
                         self.write_expr(module, pointer, func_ctx)?;
+
+                        if close_paren {
+                            write!(self.out, ")")?;
+                        }
                     }
                 }
             }
@@ -2586,3 +2799,139 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         Ok(())
     }
 }
+
+pub(super) struct MatrixType {
+    pub(super) columns: crate::VectorSize,
+    pub(super) rows: crate::VectorSize,
+    pub(super) width: crate::Bytes,
+}
+
+pub(super) fn get_inner_matrix_data(
+    module: &Module,
+    handle: Handle<crate::Type>,
+) -> Option<MatrixType> {
+    match module.types[handle].inner {
+        TypeInner::Matrix {
+            columns,
+            rows,
+            width,
+        } => Some(MatrixType {
+            columns,
+            rows,
+            width,
+        }),
+        TypeInner::Array { base, .. } => get_inner_matrix_data(module, base),
+        _ => None,
+    }
+}
+
+/// Returns the matrix data if the access chain starting at `base`:
+/// - starts with an expression with resolved type of [`TypeInner::Matrix`] if `direct = true`
+/// - contains one or more expressions with resolved type of [`TypeInner::Array`] of [`TypeInner::Matrix`]
+/// - ends at an expression with resolved type of [`TypeInner::Struct`]
+pub(super) fn get_inner_matrix_of_struct_array_member(
+    module: &Module,
+    base: Handle<crate::Expression>,
+    func_ctx: &back::FunctionCtx<'_>,
+    direct: bool,
+) -> Option<MatrixType> {
+    let mut mat_data = None;
+    let mut array_base = None;
+
+    let mut current_base = base;
+    loop {
+        let mut resolved = func_ctx.info[current_base].ty.inner_with(&module.types);
+        if let TypeInner::Pointer { base, .. } = *resolved {
+            resolved = &module.types[base].inner;
+        };
+
+        match *resolved {
+            TypeInner::Matrix {
+                columns,
+                rows,
+                width,
+            } => {
+                mat_data = Some(MatrixType {
+                    columns,
+                    rows,
+                    width,
+                })
+            }
+            TypeInner::Array { base, .. } => {
+                array_base = Some(base);
+            }
+            TypeInner::Struct { .. } => {
+                if let Some(array_base) = array_base {
+                    if direct {
+                        return mat_data;
+                    } else {
+                        return get_inner_matrix_data(module, array_base);
+                    }
+                }
+
+                break;
+            }
+            _ => break,
+        }
+
+        current_base = match func_ctx.expressions[current_base] {
+            crate::Expression::Access { base, .. } => base,
+            crate::Expression::AccessIndex { base, .. } => base,
+            _ => break,
+        };
+    }
+    None
+}
+
+/// Returns the matrix data if the access chain starting at `base`:
+/// - starts with an expression with resolved type of [`TypeInner::Matrix`]
+/// - contains zero or more expressions with resolved type of [`TypeInner::Array`] of [`TypeInner::Matrix`]
+/// - ends with an [`Expression::GlobalVariable`](crate::Expression::GlobalVariable) in [`AddressSpace::Uniform`](crate::AddressSpace::Uniform)
+fn get_inner_matrix_of_global_uniform(
+    module: &Module,
+    base: Handle<crate::Expression>,
+    func_ctx: &back::FunctionCtx<'_>,
+) -> Option<MatrixType> {
+    let mut mat_data = None;
+    let mut array_base = None;
+
+    let mut current_base = base;
+    loop {
+        let mut resolved = func_ctx.info[current_base].ty.inner_with(&module.types);
+        if let TypeInner::Pointer { base, .. } = *resolved {
+            resolved = &module.types[base].inner;
+        };
+
+        match *resolved {
+            TypeInner::Matrix {
+                columns,
+                rows,
+                width,
+            } => {
+                mat_data = Some(MatrixType {
+                    columns,
+                    rows,
+                    width,
+                })
+            }
+            TypeInner::Array { base, .. } => {
+                array_base = Some(base);
+            }
+            _ => break,
+        }
+
+        current_base = match func_ctx.expressions[current_base] {
+            crate::Expression::Access { base, .. } => base,
+            crate::Expression::AccessIndex { base, .. } => base,
+            crate::Expression::GlobalVariable(handle)
+                if module.global_variables[handle].space == crate::AddressSpace::Uniform =>
+            {
+                return mat_data.or_else(|| {
+                    array_base.and_then(|array_base| get_inner_matrix_data(module, array_base))
+                })
+            }
+            _ => break,
+        };
+    }
+    None
+}
diff --git a/tests/in/access.param.ron b/tests/in/access.param.ron
index 3e79bea26c..8408e4cb6b 100644
--- a/tests/in/access.param.ron
+++ b/tests/in/access.param.ron
@@ -12,6 +12,7 @@
 					(group: 0, binding: 0): (buffer: Some(0), mutable: false),
 					(group: 0, binding: 1): (buffer: Some(1), mutable: false),
 					(group: 0, binding: 2): (buffer: Some(2), mutable: false),
+					(group: 0, binding: 3): (buffer: Some(3), mutable: false),
 				},
 				sizes_buffer: Some(24),
 			),
diff --git a/tests/in/access.wgsl b/tests/in/access.wgsl
index 456aee4b32..c2e4d25b6e 100644
--- a/tests/in/access.wgsl
+++ b/tests/in/access.wgsl
@@ -61,6 +61,43 @@ fn test_matrix_within_struct_accesses() {
     t.m[idx][idx] = 40.0;
 }
 
+struct MatCx2InArray {
+	am: array<mat4x2<f32>, 2>,
+}
+
+@group(0) @binding(3)
+var<uniform> nested_mat_cx2: MatCx2InArray;
+
+fn test_matrix_within_array_within_struct_accesses() {
+	var idx = 1;
+
+    idx--;
+
+	// loads
+    _ = nested_mat_cx2.am;
+    _ = nested_mat_cx2.am[0];
+    _ = nested_mat_cx2.am[0][0];
+    _ = nested_mat_cx2.am[0][idx];
+    _ = nested_mat_cx2.am[0][0][1];
+    _ = nested_mat_cx2.am[0][0][idx];
+    _ = nested_mat_cx2.am[0][idx][1];
+    _ = nested_mat_cx2.am[0][idx][idx];
+
+    var t = MatCx2InArray(array<mat4x2<f32>, 2>());
+
+    idx++;
+
+	// stores
+    t.am = array<mat4x2<f32>, 2>();
+    t.am[0] = mat4x2<f32>(vec2<f32>(8.0), vec2<f32>(7.0), vec2<f32>(6.0), vec2<f32>(5.0));
+    t.am[0][0] = vec2<f32>(9.0);
+    t.am[0][idx] = vec2<f32>(90.0);
+    t.am[0][0][1] = 10.0;
+    t.am[0][0][idx] = 20.0;
+    t.am[0][idx][1] = 30.0;
+    t.am[0][idx][idx] = 40.0;
+}
+
 fn read_from_private(foo: ptr<function, f32>) -> f32 {
     return *foo;
 }
@@ -77,6 +114,7 @@ fn foo_vert(@builtin(vertex_index) vi: u32) -> @builtin(position) vec4<f32> {
     foo = 1.0;
 
 	test_matrix_within_struct_accesses();
+	test_matrix_within_array_within_struct_accesses();
 
     // test storage loads
 	let _matrix = bar._matrix;
diff --git a/tests/in/globals.wgsl b/tests/in/globals.wgsl
index 89ab106f9d..59820ab367 100644
--- a/tests/in/globals.wgsl
+++ b/tests/in/globals.wgsl
@@ -20,10 +20,16 @@ var<storage> dummy: array<vec2<f32>>;
 var<uniform> float_vecs: array<vec4<f32>, 20>;
 
 @group(0) @binding(4)
-var<uniform> global_vec: vec4<f32>;
+var<uniform> global_vec: vec3<f32>;
 
 @group(0) @binding(5)
-var<uniform> global_mat: mat4x4<f32>;
+var<uniform> global_mat: mat3x2<f32>;
+
+@group(0) @binding(6)
+var<uniform> global_nested_arrays_of_matrices_2x4: array<array<mat2x4<f32>, 2>, 2>;
+
+@group(0) @binding(7)
+var<uniform> global_nested_arrays_of_matrices_4x2: array<array<mat4x2<f32>, 2>, 2>;
 
 fn test_msl_packed_vec3_as_arg(arg: vec3<f32>) {}
 
@@ -56,6 +62,7 @@ fn test_msl_packed_vec3() {
 fn main() {
     test_msl_packed_vec3();
 
+    wg[7] = (global_nested_arrays_of_matrices_4x2[0][0] * global_nested_arrays_of_matrices_2x4[0][0][0]).x;
     wg[6] = (global_mat * global_vec).x;
     wg[5] = dummy[1].y;
     wg[4] = float_vecs[0].w;
diff --git a/tests/out/glsl/access.assign_through_ptr.Compute.glsl b/tests/out/glsl/access.assign_through_ptr.Compute.glsl
index 34b989a0f8..c7ee8362c2 100644
--- a/tests/out/glsl/access.assign_through_ptr.Compute.glsl
+++ b/tests/out/glsl/access.assign_through_ptr.Compute.glsl
@@ -16,12 +16,15 @@ struct AlignedWrapper {
 struct Baz {
     mat3x2 m;
 };
+struct MatCx2InArray {
+    mat4x2 am[2];
+};
 shared uint val;
 
 
 float read_from_private(inout float foo_1) {
-    float _e5 = foo_1;
-    return _e5;
+    float _e6 = foo_1;
+    return _e6;
 }
 
 float test_arr_as_arg(float a[5][10]) {
diff --git a/tests/out/glsl/access.atomics.Compute.glsl b/tests/out/glsl/access.atomics.Compute.glsl
index 83e7e1e780..56c844f6a1 100644
--- a/tests/out/glsl/access.atomics.Compute.glsl
+++ b/tests/out/glsl/access.atomics.Compute.glsl
@@ -16,6 +16,9 @@ struct AlignedWrapper {
 struct Baz {
     mat3x2 m;
 };
+struct MatCx2InArray {
+    mat4x2 am[2];
+};
 layout(std430) buffer Bar_block_0Compute {
     mat4x3 _matrix;
     mat2x2 matrix_array[2];
@@ -26,8 +29,8 @@ layout(std430) buffer Bar_block_0Compute {
 
 
 float read_from_private(inout float foo_1) {
-    float _e5 = foo_1;
-    return _e5;
+    float _e6 = foo_1;
+    return _e6;
 }
 
 float test_arr_as_arg(float a[5][10]) {
@@ -42,22 +45,22 @@ void assign_through_ptr_fn(inout uint p) {
 void main() {
     int tmp = 0;
     int value = _group_0_binding_0_cs.atom;
-    int _e9 = atomicAdd(_group_0_binding_0_cs.atom, 5);
-    tmp = _e9;
-    int _e12 = atomicAdd(_group_0_binding_0_cs.atom, -5);
-    tmp = _e12;
-    int _e15 = atomicAnd(_group_0_binding_0_cs.atom, 5);
-    tmp = _e15;
-    int _e18 = atomicOr(_group_0_binding_0_cs.atom, 5);
-    tmp = _e18;
-    int _e21 = atomicXor(_group_0_binding_0_cs.atom, 5);
-    tmp = _e21;
-    int _e24 = atomicMin(_group_0_binding_0_cs.atom, 5);
-    tmp = _e24;
-    int _e27 = atomicMax(_group_0_binding_0_cs.atom, 5);
-    tmp = _e27;
-    int _e30 = atomicExchange(_group_0_binding_0_cs.atom, 5);
-    tmp = _e30;
+    int _e10 = atomicAdd(_group_0_binding_0_cs.atom, 5);
+    tmp = _e10;
+    int _e13 = atomicAdd(_group_0_binding_0_cs.atom, -5);
+    tmp = _e13;
+    int _e16 = atomicAnd(_group_0_binding_0_cs.atom, 5);
+    tmp = _e16;
+    int _e19 = atomicOr(_group_0_binding_0_cs.atom, 5);
+    tmp = _e19;
+    int _e22 = atomicXor(_group_0_binding_0_cs.atom, 5);
+    tmp = _e22;
+    int _e25 = atomicMin(_group_0_binding_0_cs.atom, 5);
+    tmp = _e25;
+    int _e28 = atomicMax(_group_0_binding_0_cs.atom, 5);
+    tmp = _e28;
+    int _e31 = atomicExchange(_group_0_binding_0_cs.atom, 5);
+    tmp = _e31;
     _group_0_binding_0_cs.atom = value;
     return;
 }
diff --git a/tests/out/glsl/access.foo_frag.Fragment.glsl b/tests/out/glsl/access.foo_frag.Fragment.glsl
index 36c8fb2a44..a3d6d21dd1 100644
--- a/tests/out/glsl/access.foo_frag.Fragment.glsl
+++ b/tests/out/glsl/access.foo_frag.Fragment.glsl
@@ -14,6 +14,9 @@ struct AlignedWrapper {
 struct Baz {
     mat3x2 m;
 };
+struct MatCx2InArray {
+    mat4x2 am[2];
+};
 layout(std430) buffer Bar_block_0Fragment {
     mat4x3 _matrix;
     mat2x2 matrix_array[2];
@@ -27,8 +30,8 @@ layout(std430) buffer type_11_block_1Fragment { ivec2 _group_0_binding_2_fs; };
 layout(location = 0) out vec4 _fs2p_location0;
 
 float read_from_private(inout float foo_1) {
-    float _e5 = foo_1;
-    return _e5;
+    float _e6 = foo_1;
+    return _e6;
 }
 
 float test_arr_as_arg(float a[5][10]) {
diff --git a/tests/out/glsl/access.foo_vert.Vertex.glsl b/tests/out/glsl/access.foo_vert.Vertex.glsl
index cd3a65ac06..9fbfc272f7 100644
--- a/tests/out/glsl/access.foo_vert.Vertex.glsl
+++ b/tests/out/glsl/access.foo_vert.Vertex.glsl
@@ -14,6 +14,9 @@ struct AlignedWrapper {
 struct Baz {
     mat3x2 m;
 };
+struct MatCx2InArray {
+    mat4x2 am[2];
+};
 layout(std430) buffer Bar_block_0Vertex {
     mat4x3 _matrix;
     mat2x2 matrix_array[2];
@@ -26,6 +29,8 @@ uniform Baz_block_1Vertex { Baz _group_0_binding_1_vs; };
 
 layout(std430) buffer type_11_block_2Vertex { ivec2 _group_0_binding_2_vs; };
 
+uniform MatCx2InArray_block_3Vertex { MatCx2InArray _group_0_binding_3_vs; };
+
 
 void test_matrix_within_struct_accesses() {
     int idx = 1;
@@ -62,9 +67,46 @@ void test_matrix_within_struct_accesses() {
     return;
 }
 
+void test_matrix_within_array_within_struct_accesses() {
+    int idx_1 = 1;
+    MatCx2InArray t_1 = MatCx2InArray(mat4x2[2](mat4x2(0.0), mat4x2(0.0)));
+    int _e7 = idx_1;
+    idx_1 = (_e7 - 1);
+    mat4x2 unnamed_7[2] = _group_0_binding_3_vs.am;
+    mat4x2 unnamed_8 = _group_0_binding_3_vs.am[0];
+    vec2 unnamed_9 = _group_0_binding_3_vs.am[0][0];
+    int _e25 = idx_1;
+    vec2 unnamed_10 = _group_0_binding_3_vs.am[0][_e25];
+    float unnamed_11 = _group_0_binding_3_vs.am[0][0][1];
+    int _e41 = idx_1;
+    float unnamed_12 = _group_0_binding_3_vs.am[0][0][_e41];
+    int _e47 = idx_1;
+    float unnamed_13 = _group_0_binding_3_vs.am[0][_e47][1];
+    int _e55 = idx_1;
+    int _e57 = idx_1;
+    float unnamed_14 = _group_0_binding_3_vs.am[0][_e55][_e57];
+    t_1 = MatCx2InArray(mat4x2[2](mat4x2(vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0)), mat4x2(vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0))));
+    int _e63 = idx_1;
+    idx_1 = (_e63 + 1);
+    t_1.am = mat4x2[2](mat4x2(vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0)), mat4x2(vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0), vec2(0.0, 0.0)));
+    t_1.am[0] = mat4x2(vec2(8.0), vec2(7.0), vec2(6.0), vec2(5.0));
+    t_1.am[0][0] = vec2(9.0);
+    int _e90 = idx_1;
+    t_1.am[0][_e90] = vec2(90.0);
+    t_1.am[0][0][1] = 10.0;
+    int _e107 = idx_1;
+    t_1.am[0][0][_e107] = 20.0;
+    int _e113 = idx_1;
+    t_1.am[0][_e113][1] = 30.0;
+    int _e121 = idx_1;
+    int _e123 = idx_1;
+    t_1.am[0][_e121][_e123] = 40.0;
+    return;
+}
+
 float read_from_private(inout float foo_1) {
-    float _e5 = foo_1;
-    return _e5;
+    float _e6 = foo_1;
+    return _e6;
 }
 
 float test_arr_as_arg(float a[5][10]) {
@@ -83,16 +125,17 @@ void main() {
     float baz_1 = foo;
     foo = 1.0;
     test_matrix_within_struct_accesses();
+    test_matrix_within_array_within_struct_accesses();
     mat4x3 _matrix = _group_0_binding_0_vs._matrix;
     uvec2 arr[2] = _group_0_binding_0_vs.arr;
     float b = _group_0_binding_0_vs._matrix[3][0];
     int a_1 = _group_0_binding_0_vs.data[(uint(_group_0_binding_0_vs.data.length()) - 2u)].value;
     ivec2 c_1 = _group_0_binding_2_vs;
-    float _e31 = read_from_private(foo);
+    float _e32 = read_from_private(foo);
     c = int[5](a_1, int(b), 3, 4, 5);
     c[(vi + 1u)] = 42;
     int value = c[vi];
-    float _e45 = test_arr_as_arg(float[5][10](float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
+    float _e46 = test_arr_as_arg(float[5][10](float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), float[10](0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
     gl_Position = vec4((_matrix * vec4(ivec4(value))), 2.0);
     gl_Position.yz = vec2(-gl_Position.y, gl_Position.z * 2.0 - gl_Position.w);
     return;
diff --git a/tests/out/glsl/globals.main.Compute.glsl b/tests/out/glsl/globals.main.Compute.glsl
index 40066567f3..0d5cb797bf 100644
--- a/tests/out/glsl/globals.main.Compute.glsl
+++ b/tests/out/glsl/globals.main.Compute.glsl
@@ -19,9 +19,13 @@ layout(std430) readonly buffer type_6_block_1Compute { vec2 _group_0_binding_2_c
 
 uniform type_8_block_2Compute { vec4 _group_0_binding_3_cs[20]; };
 
-uniform type_7_block_3Compute { vec4 _group_0_binding_4_cs; };
+uniform type_4_block_3Compute { vec3 _group_0_binding_4_cs; };
 
-uniform type_9_block_4Compute { mat4x4 _group_0_binding_5_cs; };
+uniform type_9_block_4Compute { mat3x2 _group_0_binding_5_cs; };
+
+uniform type_12_block_5Compute { mat2x4 _group_0_binding_6_cs[2][2]; };
+
+uniform type_15_block_6Compute { mat4x2 _group_0_binding_7_cs[2][2]; };
 
 
 void test_msl_packed_vec3_as_arg(vec3 arg) {
@@ -33,8 +37,8 @@ void test_msl_packed_vec3_() {
     _group_0_binding_1_cs.v3_ = vec3(1.0);
     _group_0_binding_1_cs.v3_.x = 1.0;
     _group_0_binding_1_cs.v3_.x = 2.0;
-    int _e21 = idx;
-    _group_0_binding_1_cs.v3_[_e21] = 3.0;
+    int _e23 = idx;
+    _group_0_binding_1_cs.v3_[_e23] = 3.0;
     Foo data = _group_0_binding_1_cs;
     vec3 unnamed = data.v3_;
     vec2 unnamed_1 = data.v3_.zx;
@@ -49,17 +53,20 @@ void main() {
     float Foo_1 = 1.0;
     bool at = true;
     test_msl_packed_vec3_();
-    mat4x4 _e10 = _group_0_binding_5_cs;
-    vec4 _e11 = _group_0_binding_4_cs;
-    wg[6] = (_e10 * _e11).x;
-    float _e19 = _group_0_binding_2_cs[1].y;
-    wg[5] = _e19;
-    float _e25 = _group_0_binding_3_cs[0].w;
-    wg[4] = _e25;
-    float _e29 = _group_0_binding_1_cs.v1_;
-    wg[3] = _e29;
-    float _e34 = _group_0_binding_1_cs.v3_.x;
-    wg[2] = _e34;
+    mat4x2 _e16 = _group_0_binding_7_cs[0][0];
+    vec4 _e23 = _group_0_binding_6_cs[0][0][0];
+    wg[7] = (_e16 * _e23).x;
+    mat3x2 _e28 = _group_0_binding_5_cs;
+    vec3 _e29 = _group_0_binding_4_cs;
+    wg[6] = (_e28 * _e29).x;
+    float _e37 = _group_0_binding_2_cs[1].y;
+    wg[5] = _e37;
+    float _e43 = _group_0_binding_3_cs[0].w;
+    wg[4] = _e43;
+    float _e47 = _group_0_binding_1_cs.v1_;
+    wg[3] = _e47;
+    float _e52 = _group_0_binding_1_cs.v3_.x;
+    wg[2] = _e52;
     _group_0_binding_1_cs.v1_ = 4.0;
     wg[1] = float(uint(_group_0_binding_2_cs.length()));
     at_1 = 2u;
diff --git a/tests/out/hlsl/access.hlsl b/tests/out/hlsl/access.hlsl
index 5f264572da..5510ec05bc 100644
--- a/tests/out/hlsl/access.hlsl
+++ b/tests/out/hlsl/access.hlsl
@@ -1,4 +1,52 @@
 
+typedef struct { float2 _0; float2 _1; } __mat2x2;
+float2 __get_col_of_mat2x2(__mat2x2 mat, uint idx) {
+    switch(idx) {
+    case 0: { return mat._0; }
+    case 1: { return mat._1; }
+    default: { return (float2)0; }
+    }
+}
+void __set_col_of_mat2x2(__mat2x2 mat, uint idx, float2 value) {
+    switch(idx) {
+    case 0: { mat._0 = value; break; }
+    case 1: { mat._1 = value; break; }
+    }
+}
+void __set_el_of_mat2x2(__mat2x2 mat, uint idx, uint vec_idx, float value) {
+    switch(idx) {
+    case 0: { mat._0[vec_idx] = value; break; }
+    case 1: { mat._1[vec_idx] = value; break; }
+    }
+}
+
+typedef struct { float2 _0; float2 _1; float2 _2; float2 _3; } __mat4x2;
+float2 __get_col_of_mat4x2(__mat4x2 mat, uint idx) {
+    switch(idx) {
+    case 0: { return mat._0; }
+    case 1: { return mat._1; }
+    case 2: { return mat._2; }
+    case 3: { return mat._3; }
+    default: { return (float2)0; }
+    }
+}
+void __set_col_of_mat4x2(__mat4x2 mat, uint idx, float2 value) {
+    switch(idx) {
+    case 0: { mat._0 = value; break; }
+    case 1: { mat._1 = value; break; }
+    case 2: { mat._2 = value; break; }
+    case 3: { mat._3 = value; break; }
+    }
+}
+void __set_el_of_mat4x2(__mat4x2 mat, uint idx, uint vec_idx, float value) {
+    switch(idx) {
+    case 0: { mat._0[vec_idx] = value; break; }
+    case 1: { mat._1[vec_idx] = value; break; }
+    case 2: { mat._2[vec_idx] = value; break; }
+    case 3: { mat._3[vec_idx] = value; break; }
+    }
+}
+
 struct GlobalConst {
     uint a;
     int _pad1_0;
@@ -17,6 +65,10 @@ struct Baz {
     float2 m_0; float2 m_1; float2 m_2;
 };
 
+struct MatCx2InArray {
+    __mat4x2 am[2];
+};
+
 GlobalConst ConstructGlobalConst(uint arg0, uint3 arg1, int arg2) {
     GlobalConst ret = (GlobalConst)0;
     ret.a = arg0;
@@ -25,6 +77,12 @@ GlobalConst ConstructGlobalConst(uint arg0, uint3 arg1, int arg2) {
     return ret;
 }
 
+typedef float4x2 ret_Constructarray2_float4x2_[2];
+ret_Constructarray2_float4x2_ Constructarray2_float4x2_(float4x2 arg0, float4x2 arg1) {
+    float4x2 ret[2] = { arg0, arg1 };
+    return ret;
+}
+
 typedef float ret_Constructarray10_float_[10];
 ret_Constructarray10_float_ Constructarray10_float_(float arg0, float arg1, float arg2, float arg3, float arg4, float arg5, float arg6, float arg7, float arg8, float arg9) {
     float ret[10] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 };
@@ -41,6 +99,7 @@ static GlobalConst global_const = ConstructGlobalConst(0u, uint3(0u, 0u, 0u), 0)
 RWByteAddressBuffer bar : register(u0);
 cbuffer baz : register(b1) { Baz baz; }
 RWByteAddressBuffer qux : register(u2);
+cbuffer nested_mat_cx2_ : register(b3) { MatCx2InArray nested_mat_cx2_; }
 groupshared uint val;
 
 float3x2 GetMatmOnBaz(Baz obj) {
@@ -114,10 +173,55 @@ void test_matrix_within_struct_accesses()
     return;
 }
 
+MatCx2InArray ConstructMatCx2InArray(float4x2 arg0[2]) {
+    MatCx2InArray ret = (MatCx2InArray)0;
+    ret.am = (__mat4x2[2])arg0;
+    return ret;
+}
+
+void test_matrix_within_array_within_struct_accesses()
+{
+    int idx_1 = 1;
+    MatCx2InArray t_1 = (MatCx2InArray)0;
+
+    int _expr7 = idx_1;
+    idx_1 = (_expr7 - 1);
+    float4x2 unnamed_7[2] = ((float4x2[2])nested_mat_cx2_.am);
+    float4x2 unnamed_8 = ((float4x2)nested_mat_cx2_.am[0]);
+    float2 unnamed_9 = nested_mat_cx2_.am[0]._0;
+    int _expr25 = idx_1;
+    float2 unnamed_10 = __get_col_of_mat4x2(nested_mat_cx2_.am[0], _expr25);
+    float unnamed_11 = nested_mat_cx2_.am[0]._0[1];
+    int _expr41 = idx_1;
+    float unnamed_12 = nested_mat_cx2_.am[0]._0[_expr41];
+    int _expr47 = idx_1;
+    float unnamed_13 = __get_col_of_mat4x2(nested_mat_cx2_.am[0], _expr47)[1];
+    int _expr55 = idx_1;
+    int _expr57 = idx_1;
+    float unnamed_14 = __get_col_of_mat4x2(nested_mat_cx2_.am[0], _expr55)[_expr57];
+    t_1 = ConstructMatCx2InArray(Constructarray2_float4x2_(float4x2(float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0)), float4x2(float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0))));
+    int _expr63 = idx_1;
+    idx_1 = (_expr63 + 1);
+    t_1.am = (__mat4x2[2])Constructarray2_float4x2_(float4x2(float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0)), float4x2(float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0), float2(0.0, 0.0)));
+    t_1.am[0] = (__mat4x2)float4x2((8.0).xx, (7.0).xx, (6.0).xx, (5.0).xx);
+    t_1.am[0]._0 = (9.0).xx;
+    int _expr90 = idx_1;
+    __set_col_of_mat4x2(t_1.am[0], _expr90, (90.0).xx);
+    t_1.am[0]._0[1] = 10.0;
+    int _expr107 = idx_1;
+    t_1.am[0]._0[_expr107] = 20.0;
+    int _expr113 = idx_1;
+    __set_el_of_mat4x2(t_1.am[0], _expr113, 1, 30.0);
+    int _expr121 = idx_1;
+    int _expr123 = idx_1;
+    __set_el_of_mat4x2(t_1.am[0], _expr121, _expr123, 40.0);
+    return;
+}
+
 float read_from_private(inout float foo_1)
 {
-    float _expr5 = foo_1;
-    return _expr5;
+    float _expr6 = foo_1;
+    return _expr6;
 }
 
 float test_arr_as_arg(float a[5][10])
@@ -152,19 +256,17 @@ float4 foo_vert(uint vi : SV_VertexID) : SV_Position
     float baz_1 = foo;
     foo = 1.0;
     test_matrix_within_struct_accesses();
+    test_matrix_within_array_within_struct_accesses();
     float4x3 _matrix = float4x3(asfloat(bar.Load3(0+0)), asfloat(bar.Load3(0+16)), asfloat(bar.Load3(0+32)), asfloat(bar.Load3(0+48)));
     uint2 arr[2] = {asuint(bar.Load2(104+0)), asuint(bar.Load2(104+8))};
     float b = asfloat(bar.Load(0+48+0));
     int a_1 = asint(bar.Load(0+(((NagaBufferLengthRW(bar) - 120) / 8) - 2u)*8+120));
     int2 c_1 = asint(qux.Load2(0));
-    const float _e31 = read_from_private(foo);
-    {
-        int _result[5]=Constructarray5_int_(a_1, int(b), 3, 4, 5);
-        for(int _i=0; _i<5; ++_i) c[_i] = _result[_i];
-    }
+    const float _e32 = read_from_private(foo);
+    c = Constructarray5_int_(a_1, int(b), 3, 4, 5);
     c[(vi + 1u)] = 42;
     int value = c[vi];
-    const float _e45 = test_arr_as_arg(Constructarray5_array10_float__(Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
+    const float _e46 = test_arr_as_arg(Constructarray5_array10_float__(Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), Constructarray10_float_(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
     return float4(mul(float4((value).xxxx), _matrix), 2.0);
 }
 
@@ -200,22 +302,22 @@ void atomics()
     int tmp = (int)0;
 
     int value_1 = asint(bar.Load(96));
-    int _e9; bar.InterlockedAdd(96, 5, _e9);
-    tmp = _e9;
-    int _e12; bar.InterlockedAdd(96, -5, _e12);
-    tmp = _e12;
-    int _e15; bar.InterlockedAnd(96, 5, _e15);
-    tmp = _e15;
-    int _e18; bar.InterlockedOr(96, 5, _e18);
-    tmp = _e18;
-    int _e21; bar.InterlockedXor(96, 5, _e21);
-    tmp = _e21;
-    int _e24; bar.InterlockedMin(96, 5, _e24);
-    tmp = _e24;
-    int _e27; bar.InterlockedMax(96, 5, _e27);
-    tmp = _e27;
-    int _e30; bar.InterlockedExchange(96, 5, _e30);
-    tmp = _e30;
+    int _e10; bar.InterlockedAdd(96, 5, _e10);
+    tmp = _e10;
+    int _e13; bar.InterlockedAdd(96, -5, _e13);
+    tmp = _e13;
+    int _e16; bar.InterlockedAnd(96, 5, _e16);
+    tmp = _e16;
+    int _e19; bar.InterlockedOr(96, 5, _e19);
+    tmp = _e19;
+    int _e22; bar.InterlockedXor(96, 5, _e22);
+    tmp = _e22;
+    int _e25; bar.InterlockedMin(96, 5, _e25);
+    tmp = _e25;
+    int _e28; bar.InterlockedMax(96, 5, _e28);
+    tmp = _e28;
+    int _e31; bar.InterlockedExchange(96, 5, _e31);
+    tmp = _e31;
     bar.Store(96, asuint(value_1));
     return;
 }
diff --git a/tests/out/hlsl/globals.hlsl b/tests/out/hlsl/globals.hlsl
index 2274aa4288..def5c8dbb2 100644
--- a/tests/out/hlsl/globals.hlsl
+++ b/tests/out/hlsl/globals.hlsl
@@ -1,5 +1,56 @@
 static const bool Foo_2 = true;
 
+typedef struct { float2 _0; float2 _1; float2 _2; } __mat3x2;
+float2 __get_col_of_mat3x2(__mat3x2 mat, uint idx) {
+    switch(idx) {
+    case 0: { return mat._0; }
+    case 1: { return mat._1; }
+    case 2: { return mat._2; }
+    default: { return (float2)0; }
+    }
+}
+void __set_col_of_mat3x2(__mat3x2 mat, uint idx, float2 value) {
+    switch(idx) {
+    case 0: { mat._0 = value; break; }
+    case 1: { mat._1 = value; break; }
+    case 2: { mat._2 = value; break; }
+    }
+}
+void __set_el_of_mat3x2(__mat3x2 mat, uint idx, uint vec_idx, float value) {
+    switch(idx) {
+    case 0: { mat._0[vec_idx] = value; break; }
+    case 1: { mat._1[vec_idx] = value; break; }
+    case 2: { mat._2[vec_idx] = value; break; }
+    }
+}
+
+typedef struct { float2 _0; float2 _1; float2 _2; float2 _3; } __mat4x2;
+float2 __get_col_of_mat4x2(__mat4x2 mat, uint idx) {
+    switch(idx) {
+    case 0: { return mat._0; }
+    case 1: { return mat._1; }
+    case 2: { return mat._2; }
+    case 3: { return mat._3; }
+    default: { return (float2)0; }
+    }
+}
+void __set_col_of_mat4x2(__mat4x2 mat, uint idx, float2 value) {
+    switch(idx) {
+    case 0: { mat._0 = value; break; }
+    case 1: { mat._1 = value; break; }
+    case 2: { mat._2 = value; break; }
+    case 3: { mat._3 = value; break; }
+    }
+}
+void __set_el_of_mat4x2(__mat4x2 mat, uint idx, uint vec_idx, float value) {
+    switch(idx) {
+    case 0: { mat._0[vec_idx] = value; break; }
+    case 1: { mat._1[vec_idx] = value; break; }
+    case 2: { mat._2[vec_idx] = value; break; }
+    case 3: { mat._3[vec_idx] = value; break; }
+    }
+}
+
 struct Foo {
     float3 v3_;
     float v1_;
@@ -10,8 +61,10 @@ groupshared uint at_1;
 RWByteAddressBuffer alignment : register(u1);
 ByteAddressBuffer dummy : register(t2);
 cbuffer float_vecs : register(b3) { float4 float_vecs[20]; }
-cbuffer global_vec : register(b4) { float4 global_vec; }
-cbuffer global_mat : register(b5) { row_major float4x4 global_mat; }
+cbuffer global_vec : register(b4) { float3 global_vec; }
+cbuffer global_mat : register(b5) { __mat3x2 global_mat; }
+cbuffer global_nested_arrays_of_matrices_2x4_ : register(b6) { row_major float2x4 global_nested_arrays_of_matrices_2x4_[2][2]; }
+cbuffer global_nested_arrays_of_matrices_4x2_ : register(b7) { __mat4x2 global_nested_arrays_of_matrices_4x2_[2][2]; }
 
 void test_msl_packed_vec3_as_arg(float3 arg)
 {
@@ -32,8 +85,8 @@ void test_msl_packed_vec3_()
     alignment.Store3(0, asuint((1.0).xxx));
     alignment.Store(0+0, asuint(1.0));
     alignment.Store(0+0, asuint(2.0));
-    int _expr21 = idx;
-    alignment.Store(_expr21*4+0, asuint(3.0));
+    int _expr23 = idx;
+    alignment.Store(_expr23*4+0, asuint(3.0));
     Foo data = ConstructFoo(asfloat(alignment.Load3(0)), asfloat(alignment.Load(12)));
     float3 unnamed = data.v3_;
     float2 unnamed_1 = data.v3_.zx;
@@ -58,17 +111,20 @@ void main()
     bool at = true;
 
     test_msl_packed_vec3_();
-    float4x4 _expr10 = global_mat;
-    float4 _expr11 = global_vec;
-    wg[6] = mul(_expr11, _expr10).x;
-    float _expr19 = asfloat(dummy.Load(4+8));
-    wg[5] = _expr19;
-    float _expr25 = float_vecs[0].w;
-    wg[4] = _expr25;
-    float _expr29 = asfloat(alignment.Load(12));
-    wg[3] = _expr29;
-    float _expr34 = asfloat(alignment.Load(0+0));
-    wg[2] = _expr34;
+    float4x2 _expr16 = ((float4x2)global_nested_arrays_of_matrices_4x2_[0][0]);
+    float4 _expr23 = global_nested_arrays_of_matrices_2x4_[0][0][0];
+    wg[7] = mul(_expr23, _expr16).x;
+    float3x2 _expr28 = ((float3x2)global_mat);
+    float3 _expr29 = global_vec;
+    wg[6] = mul(_expr29, _expr28).x;
+    float _expr37 = asfloat(dummy.Load(4+8));
+    wg[5] = _expr37;
+    float _expr43 = float_vecs[0].w;
+    wg[4] = _expr43;
+    float _expr47 = asfloat(alignment.Load(12));
+    wg[3] = _expr47;
+    float _expr52 = asfloat(alignment.Load(0+0));
+    wg[2] = _expr52;
     alignment.Store(12, asuint(4.0));
     wg[1] = float(((NagaBufferLength(dummy) - 0) / 8));
     at_1 = 2u;
diff --git a/tests/out/msl/access.msl b/tests/out/msl/access.msl
index ef4bf86a6e..19e4ab21d7 100644
--- a/tests/out/msl/access.msl
+++ b/tests/out/msl/access.msl
@@ -35,19 +35,28 @@ struct Bar {
 struct Baz {
     metal::float3x2 m;
 };
-struct type_14 {
-    float inner[10];
+struct type_13 {
+    metal::float4x2 inner[2];
+};
+struct MatCx2InArray {
+    type_13 am;
 };
-struct type_15 {
-    type_14 inner[5];
+struct type_17 {
+    float inner[10];
 };
 struct type_18 {
+    type_17 inner[5];
+};
+struct type_21 {
     int inner[5];
 };
 constant metal::uint3 const_type_1_ = {0u, 0u, 0u};
 constant GlobalConst const_GlobalConst = {0u, {}, const_type_1_, 0};
-constant type_14 const_type_14_ = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-constant type_15 const_type_15_ = {const_type_14_, const_type_14_, const_type_14_, const_type_14_, const_type_14_};
+constant metal::float2 const_type_14_ = {0.0, 0.0};
+constant metal::float4x2 const_type_12_ = {const_type_14_, const_type_14_, const_type_14_, const_type_14_};
+constant type_13 const_type_13_ = {const_type_12_, const_type_12_};
+constant type_17 const_type_17_ = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+constant type_18 const_type_18_ = {const_type_17_, const_type_17_, const_type_17_, const_type_17_, const_type_17_};
 constant metal::int2 const_type_11_ = {0, 0};
 
 void test_matrix_within_struct_accesses(
@@ -87,15 +96,54 @@ void test_matrix_within_struct_accesses(
     return;
 }
 
+void test_matrix_within_array_within_struct_accesses(
+    constant MatCx2InArray& nested_mat_cx2_
+) {
+    int idx_1 = 1;
+    MatCx2InArray t_1 = {};
+    int _e7 = idx_1;
+    idx_1 = _e7 - 1;
+    type_13 unnamed_7 = nested_mat_cx2_.am;
+    metal::float4x2 unnamed_8 = nested_mat_cx2_.am.inner[0];
+    metal::float2 unnamed_9 = nested_mat_cx2_.am.inner[0][0];
+    int _e25 = idx_1;
+    metal::float2 unnamed_10 = nested_mat_cx2_.am.inner[0][_e25];
+    float unnamed_11 = nested_mat_cx2_.am.inner[0][0].y;
+    int _e41 = idx_1;
+    float unnamed_12 = nested_mat_cx2_.am.inner[0][0][_e41];
+    int _e47 = idx_1;
+    float unnamed_13 = nested_mat_cx2_.am.inner[0][_e47].y;
+    int _e55 = idx_1;
+    int _e57 = idx_1;
+    float unnamed_14 = nested_mat_cx2_.am.inner[0][_e55][_e57];
+    t_1 = MatCx2InArray {const_type_13_};
+    int _e63 = idx_1;
+    idx_1 = _e63 + 1;
+    for(int _i=0; _i<2; ++_i) t_1.am.inner[_i] = const_type_13_.inner[_i];
+    t_1.am.inner[0] = metal::float4x2(metal::float2(8.0), metal::float2(7.0), metal::float2(6.0), metal::float2(5.0));
+    t_1.am.inner[0][0] = metal::float2(9.0);
+    int _e90 = idx_1;
+    t_1.am.inner[0][_e90] = metal::float2(90.0);
+    t_1.am.inner[0][0].y = 10.0;
+    int _e107 = idx_1;
+    t_1.am.inner[0][0][_e107] = 20.0;
+    int _e113 = idx_1;
+    t_1.am.inner[0][_e113].y = 30.0;
+    int _e121 = idx_1;
+    int _e123 = idx_1;
+    t_1.am.inner[0][_e121][_e123] = 40.0;
+    return;
+}
+
 float read_from_private(
     thread float& foo_1
 ) {
-    float _e5 = foo_1;
-    return _e5;
+    float _e6 = foo_1;
+    return _e6;
 }
 
 float test_arr_as_arg(
-    type_15 a
+    type_18 a
 ) {
     return a.inner[4].inner[9];
 }
@@ -117,23 +165,25 @@ vertex foo_vertOutput foo_vert(
 , device Bar const& bar [[buffer(0)]]
 , constant Baz& baz [[buffer(1)]]
 , device metal::int2 const& qux [[buffer(2)]]
+, constant MatCx2InArray& nested_mat_cx2_ [[buffer(3)]]
 , constant _mslBufferSizes& _buffer_sizes [[buffer(24)]]
 ) {
     float foo = 0.0;
-    type_18 c = {};
+    type_21 c = {};
     float baz_1 = foo;
     foo = 1.0;
     test_matrix_within_struct_accesses(baz);
+    test_matrix_within_array_within_struct_accesses(nested_mat_cx2_);
     metal::float4x3 _matrix = bar._matrix;
     type_8 arr = bar.arr;
     float b = bar._matrix[3].x;
     int a_1 = bar.data[(1 + (_buffer_sizes.size1 - 120 - 8) / 8) - 2u].value;
     metal::int2 c_1 = qux;
-    float _e31 = read_from_private(foo);
-    for(int _i=0; _i<5; ++_i) c.inner[_i] = type_18 {a_1, static_cast<int>(b), 3, 4, 5}.inner[_i];
+    float _e32 = read_from_private(foo);
+    for(int _i=0; _i<5; ++_i) c.inner[_i] = type_21 {a_1, static_cast<int>(b), 3, 4, 5}.inner[_i];
     c.inner[vi + 1u] = 42;
     int value = c.inner[vi];
-    float _e45 = test_arr_as_arg(const_type_15_);
+    float _e46 = test_arr_as_arg(const_type_18_);
     return foo_vertOutput { metal::float4(_matrix * static_cast<metal::float4>(metal::int4(value)), 2.0) };
 }
 
@@ -161,22 +211,22 @@ kernel void atomics(
 ) {
     int tmp = {};
     int value_1 = metal::atomic_load_explicit(&bar.atom, metal::memory_order_relaxed);
-    int _e9 = metal::atomic_fetch_add_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e9;
-    int _e12 = metal::atomic_fetch_sub_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e12;
-    int _e15 = metal::atomic_fetch_and_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e15;
-    int _e18 = metal::atomic_fetch_or_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e18;
-    int _e21 = metal::atomic_fetch_xor_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e21;
-    int _e24 = metal::atomic_fetch_min_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e24;
-    int _e27 = metal::atomic_fetch_max_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e27;
-    int _e30 = metal::atomic_exchange_explicit(&bar.atom, 5, metal::memory_order_relaxed);
-    tmp = _e30;
+    int _e10 = metal::atomic_fetch_add_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e10;
+    int _e13 = metal::atomic_fetch_sub_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e13;
+    int _e16 = metal::atomic_fetch_and_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e16;
+    int _e19 = metal::atomic_fetch_or_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e19;
+    int _e22 = metal::atomic_fetch_xor_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e22;
+    int _e25 = metal::atomic_fetch_min_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e25;
+    int _e28 = metal::atomic_fetch_max_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e28;
+    int _e31 = metal::atomic_exchange_explicit(&bar.atom, 5, metal::memory_order_relaxed);
+    tmp = _e31;
     metal::atomic_store_explicit(&bar.atom, value_1, metal::memory_order_relaxed);
     return;
 }
diff --git a/tests/out/msl/globals.msl b/tests/out/msl/globals.msl
index b02d6056d4..b461cce3f4 100644
--- a/tests/out/msl/globals.msl
+++ b/tests/out/msl/globals.msl
@@ -20,8 +20,20 @@ typedef metal::float2 type_6[1];
 struct type_8 {
     metal::float4 inner[20];
 };
+struct type_11 {
+    metal::float2x4 inner[2];
+};
+struct type_12 {
+    type_11 inner[2];
+};
+struct type_14 {
+    metal::float4x2 inner[2];
+};
+struct type_15 {
+    type_14 inner[2];
+};
 constant metal::float3 const_type_4_ = {0.0, 0.0, 0.0};
-constant metal::float3x3 const_type_11_ = {const_type_4_, const_type_4_, const_type_4_};
+constant metal::float3x3 const_type_17_ = {const_type_4_, const_type_4_, const_type_4_};
 
 void test_msl_packed_vec3_as_arg(
     metal::float3 arg
@@ -36,14 +48,14 @@ void test_msl_packed_vec3_(
     alignment.v3_ = metal::float3(1.0);
     alignment.v3_[0] = 1.0;
     alignment.v3_[0] = 2.0;
-    int _e21 = idx;
-    alignment.v3_[_e21] = 3.0;
+    int _e23 = idx;
+    alignment.v3_[_e23] = 3.0;
     Foo data = alignment;
     metal::float3 unnamed = data.v3_;
     metal::float2 unnamed_1 = metal::float3(data.v3_).zx;
     test_msl_packed_vec3_as_arg(data.v3_);
-    metal::float3 unnamed_2 = metal::float3(data.v3_) * const_type_11_;
-    metal::float3 unnamed_3 = const_type_11_ * metal::float3(data.v3_);
+    metal::float3 unnamed_2 = metal::float3(data.v3_) * const_type_17_;
+    metal::float3 unnamed_3 = const_type_17_ * metal::float3(data.v3_);
     metal::float3 unnamed_4 = data.v3_ * 2.0;
     metal::float3 unnamed_5 = 2.0 * data.v3_;
 }
@@ -54,24 +66,29 @@ kernel void main_(
 , device Foo& alignment [[user(fake0)]]
 , device type_6 const& dummy [[user(fake0)]]
 , constant type_8& float_vecs [[user(fake0)]]
-, constant metal::float4& global_vec [[user(fake0)]]
-, constant metal::float4x4& global_mat [[user(fake0)]]
+, constant metal::float3& global_vec [[user(fake0)]]
+, constant metal::float3x2& global_mat [[user(fake0)]]
+, constant type_12& global_nested_arrays_of_matrices_2x4_ [[user(fake0)]]
+, constant type_15& global_nested_arrays_of_matrices_4x2_ [[user(fake0)]]
 , constant _mslBufferSizes& _buffer_sizes [[user(fake0)]]
 ) {
     float Foo_1 = 1.0;
     bool at = true;
     test_msl_packed_vec3_(alignment);
-    metal::float4x4 _e10 = global_mat;
-    metal::float4 _e11 = global_vec;
-    wg.inner[6] = (_e10 * _e11).x;
-    float _e19 = dummy[1].y;
-    wg.inner[5] = _e19;
-    float _e25 = float_vecs.inner[0].w;
-    wg.inner[4] = _e25;
-    float _e29 = alignment.v1_;
-    wg.inner[3] = _e29;
-    float _e34 = alignment.v3_[0];
-    wg.inner[2] = _e34;
+    metal::float4x2 _e16 = global_nested_arrays_of_matrices_4x2_.inner[0].inner[0];
+    metal::float4 _e23 = global_nested_arrays_of_matrices_2x4_.inner[0].inner[0][0];
+    wg.inner[7] = (_e16 * _e23).x;
+    metal::float3x2 _e28 = global_mat;
+    metal::float3 _e29 = global_vec;
+    wg.inner[6] = (_e28 * _e29).x;
+    float _e37 = dummy[1].y;
+    wg.inner[5] = _e37;
+    float _e43 = float_vecs.inner[0].w;
+    wg.inner[4] = _e43;
+    float _e47 = alignment.v1_;
+    wg.inner[3] = _e47;
+    float _e52 = alignment.v3_[0];
+    wg.inner[2] = _e52;
     alignment.v1_ = 4.0;
     wg.inner[1] = static_cast<float>(1 + (_buffer_sizes.size3 - 0 - 8) / 8);
     metal::atomic_store_explicit(&at_1, 2u, metal::memory_order_relaxed);
diff --git a/tests/out/spv/access.spvasm b/tests/out/spv/access.spvasm
index 218b08a74a..fe7e75d4a4 100644
--- a/tests/out/spv/access.spvasm
+++ b/tests/out/spv/access.spvasm
@@ -1,91 +1,105 @@
 ; SPIR-V
 ; Version: 1.1
 ; Generator: rspirv
-; Bound: 248
+; Bound: 320
 OpCapability Shader
 OpExtension "SPV_KHR_storage_buffer_storage_class"
 %1 = OpExtInstImport "GLSL.std.450"
 OpMemoryModel Logical GLSL450
-OpEntryPoint Vertex %163 "foo_vert" %158 %161
-OpEntryPoint Fragment %202 "foo_frag" %201
-OpEntryPoint GLCompute %221 "atomics"
-OpEntryPoint GLCompute %245 "assign_through_ptr"
-OpExecutionMode %202 OriginUpperLeft
-OpExecutionMode %221 LocalSize 1 1 1
-OpExecutionMode %245 LocalSize 1 1 1
+OpEntryPoint Vertex %233 "foo_vert" %228 %231
+OpEntryPoint Fragment %274 "foo_frag" %273
+OpEntryPoint GLCompute %293 "atomics"
+OpEntryPoint GLCompute %317 "assign_through_ptr"
+OpExecutionMode %274 OriginUpperLeft
+OpExecutionMode %293 LocalSize 1 1 1
+OpExecutionMode %317 LocalSize 1 1 1
 OpSource GLSL 450
-OpMemberName %34 0 "a"
-OpMemberName %34 1 "b"
-OpMemberName %34 2 "c"
-OpName %34 "GlobalConst"
-OpMemberName %35 0 "value"
-OpName %35 "AlignedWrapper"
-OpMemberName %44 0 "_matrix"
-OpMemberName %44 1 "matrix_array"
-OpMemberName %44 2 "atom"
-OpMemberName %44 3 "arr"
-OpMemberName %44 4 "data"
-OpName %44 "Bar"
-OpMemberName %46 0 "m"
-OpName %46 "Baz"
-OpName %60 "global_const"
-OpName %62 "bar"
-OpName %64 "baz"
-OpName %67 "qux"
-OpName %70 "val"
-OpName %71 "idx"
-OpName %73 "t"
-OpName %77 "test_matrix_within_struct_accesses"
-OpName %136 "foo"
-OpName %137 "read_from_private"
-OpName %142 "a"
-OpName %143 "test_arr_as_arg"
-OpName %149 "p"
-OpName %150 "assign_through_ptr_fn"
-OpName %153 "foo"
-OpName %154 "c"
-OpName %158 "vi"
-OpName %163 "foo_vert"
-OpName %202 "foo_frag"
-OpName %218 "tmp"
-OpName %221 "atomics"
-OpName %245 "assign_through_ptr"
-OpMemberDecorate %34 0 Offset 0
-OpMemberDecorate %34 1 Offset 16
-OpMemberDecorate %34 2 Offset 28
-OpMemberDecorate %35 0 Offset 0
-OpDecorate %40 ArrayStride 16
-OpDecorate %42 ArrayStride 8
-OpDecorate %43 ArrayStride 8
-OpMemberDecorate %44 0 Offset 0
-OpMemberDecorate %44 0 ColMajor
-OpMemberDecorate %44 0 MatrixStride 16
-OpMemberDecorate %44 1 Offset 64
-OpMemberDecorate %44 1 ColMajor
-OpMemberDecorate %44 1 MatrixStride 8
-OpMemberDecorate %44 2 Offset 96
-OpMemberDecorate %44 3 Offset 104
-OpMemberDecorate %44 4 Offset 120
+OpMemberName %36 0 "a"
+OpMemberName %36 1 "b"
+OpMemberName %36 2 "c"
+OpName %36 "GlobalConst"
+OpMemberName %37 0 "value"
+OpName %37 "AlignedWrapper"
+OpMemberName %46 0 "_matrix"
+OpMemberName %46 1 "matrix_array"
+OpMemberName %46 2 "atom"
+OpMemberName %46 3 "arr"
+OpMemberName %46 4 "data"
+OpName %46 "Bar"
+OpMemberName %48 0 "m"
+OpName %48 "Baz"
+OpMemberName %52 0 "am"
+OpName %52 "MatCx2InArray"
+OpName %68 "global_const"
+OpName %70 "bar"
+OpName %72 "baz"
+OpName %75 "qux"
+OpName %78 "nested_mat_cx2"
+OpName %81 "val"
+OpName %82 "idx"
+OpName %84 "t"
+OpName %88 "test_matrix_within_struct_accesses"
+OpName %146 "idx"
+OpName %147 "t"
+OpName %151 "test_matrix_within_array_within_struct_accesses"
+OpName %206 "foo"
+OpName %207 "read_from_private"
+OpName %212 "a"
+OpName %213 "test_arr_as_arg"
+OpName %219 "p"
+OpName %220 "assign_through_ptr_fn"
+OpName %223 "foo"
+OpName %224 "c"
+OpName %228 "vi"
+OpName %233 "foo_vert"
+OpName %274 "foo_frag"
+OpName %290 "tmp"
+OpName %293 "atomics"
+OpName %317 "assign_through_ptr"
+OpMemberDecorate %36 0 Offset 0
+OpMemberDecorate %36 1 Offset 16
+OpMemberDecorate %36 2 Offset 28
+OpMemberDecorate %37 0 Offset 0
+OpDecorate %42 ArrayStride 16
+OpDecorate %44 ArrayStride 8
+OpDecorate %45 ArrayStride 8
 OpMemberDecorate %46 0 Offset 0
 OpMemberDecorate %46 0 ColMajor
-OpMemberDecorate %46 0 MatrixStride 8
-OpDecorate %49 ArrayStride 4
-OpDecorate %50 ArrayStride 40
-OpDecorate %53 ArrayStride 4
-OpDecorate %62 DescriptorSet 0
-OpDecorate %62 Binding 0
-OpDecorate %44 Block
-OpDecorate %64 DescriptorSet 0
-OpDecorate %64 Binding 1
-OpDecorate %65 Block
-OpMemberDecorate %65 0 Offset 0
-OpDecorate %67 DescriptorSet 0
-OpDecorate %67 Binding 2
-OpDecorate %68 Block
-OpMemberDecorate %68 0 Offset 0
-OpDecorate %158 BuiltIn VertexIndex
-OpDecorate %161 BuiltIn Position
-OpDecorate %201 Location 0
+OpMemberDecorate %46 0 MatrixStride 16
+OpMemberDecorate %46 1 Offset 64
+OpMemberDecorate %46 1 ColMajor
+OpMemberDecorate %46 1 MatrixStride 8
+OpMemberDecorate %46 2 Offset 96
+OpMemberDecorate %46 3 Offset 104
+OpMemberDecorate %46 4 Offset 120
+OpMemberDecorate %48 0 Offset 0
+OpMemberDecorate %48 0 ColMajor
+OpMemberDecorate %48 0 MatrixStride 8
+OpDecorate %51 ArrayStride 32
+OpMemberDecorate %52 0 Offset 0
+OpMemberDecorate %52 0 ColMajor
+OpMemberDecorate %52 0 MatrixStride 8
+OpDecorate %54 ArrayStride 4
+OpDecorate %55 ArrayStride 40
+OpDecorate %58 ArrayStride 4
+OpDecorate %70 DescriptorSet 0
+OpDecorate %70 Binding 0
+OpDecorate %46 Block
+OpDecorate %72 DescriptorSet 0
+OpDecorate %72 Binding 1
+OpDecorate %73 Block
+OpMemberDecorate %73 0 Offset 0
+OpDecorate %75 DescriptorSet 0
+OpDecorate %75 Binding 2
+OpDecorate %76 Block
+OpMemberDecorate %76 0 Offset 0
+OpDecorate %78 DescriptorSet 0
+OpDecorate %78 Binding 3
+OpDecorate %79 Block
+OpMemberDecorate %79 0 Offset 0
+OpDecorate %228 BuiltIn VertexIndex
+OpDecorate %231 BuiltIn Position
+OpDecorate %273 Location 0
 %2 = OpTypeVoid
 %4 = OpTypeInt 32 0
 %3 = OpConstant  %4  0
@@ -106,283 +120,369 @@ OpDecorate %201 Location 0
 %19 = OpConstant  %10  20.0
 %20 = OpConstant  %10  30.0
 %21 = OpConstant  %10  40.0
-%22 = OpConstant  %6  10
-%23 = OpConstant  %6  5
-%24 = OpConstant  %6  4
-%25 = OpConstant  %6  9
-%26 = OpConstant  %10  0.0
-%27 = OpConstant  %4  3
-%28 = OpConstant  %4  2
-%29 = OpConstant  %6  3
-%30 = OpConstant  %4  1
-%31 = OpConstant  %6  42
-%32 = OpConstant  %4  42
-%33 = OpTypeVector %4 3
-%34 = OpTypeStruct %4 %33 %6
-%35 = OpTypeStruct %6
-%37 = OpTypeVector %10 3
-%36 = OpTypeMatrix %37 4
-%39 = OpTypeVector %10 2
-%38 = OpTypeMatrix %39 2
-%40 = OpTypeArray %38 %7
-%41 = OpTypeVector %4 2
-%42 = OpTypeArray %41 %7
-%43 = OpTypeRuntimeArray %35
-%44 = OpTypeStruct %36 %40 %6 %42 %43
-%45 = OpTypeMatrix %39 3
-%46 = OpTypeStruct %45
-%47 = OpTypeVector %6 2
-%48 = OpTypePointer Function %10
-%49 = OpTypeArray %10 %22
-%50 = OpTypeArray %49 %23
-%51 = OpTypeVector %10 4
-%52 = OpTypePointer StorageBuffer %6
-%53 = OpTypeArray %6 %23
-%54 = OpTypePointer Workgroup %4
-%55 = OpConstantComposite  %33  %3 %3 %3
-%56 = OpConstantComposite  %34  %3 %55 %5
-%57 = OpConstantComposite  %49  %26 %26 %26 %26 %26 %26 %26 %26 %26 %26
-%58 = OpConstantComposite  %50  %57 %57 %57 %57 %57
-%59 = OpConstantComposite  %47  %5 %5
-%61 = OpTypePointer Private %34
-%60 = OpVariable  %61  Private %56
-%63 = OpTypePointer StorageBuffer %44
-%62 = OpVariable  %63  StorageBuffer
-%65 = OpTypeStruct %46
-%66 = OpTypePointer Uniform %65
-%64 = OpVariable  %66  Uniform
-%68 = OpTypeStruct %47
-%69 = OpTypePointer StorageBuffer %68
-%67 = OpVariable  %69  StorageBuffer
-%70 = OpVariable  %54  Workgroup
-%72 = OpTypePointer Function %6
-%74 = OpTypePointer Function %46
-%75 = OpConstantNull  %46
-%78 = OpTypeFunction %2
-%79 = OpTypePointer Uniform %46
-%81 = OpTypePointer StorageBuffer %47
-%85 = OpTypePointer Uniform %45
-%88 = OpTypePointer Uniform %39
-%94 = OpTypePointer Uniform %10
-%114 = OpTypePointer Function %45
-%120 = OpTypePointer Function %39
-%126 = OpTypePointer Function %10
-%138 = OpTypeFunction %10 %48
-%144 = OpTypeFunction %10 %50
-%151 = OpTypeFunction %2 %54
-%155 = OpTypePointer Function %53
-%156 = OpConstantNull  %53
-%159 = OpTypePointer Input %4
-%158 = OpVariable  %159  Input
-%162 = OpTypePointer Output %51
-%161 = OpVariable  %162  Output
-%169 = OpTypePointer StorageBuffer %36
-%172 = OpTypePointer StorageBuffer %42
-%175 = OpTypePointer StorageBuffer %37
-%176 = OpTypePointer StorageBuffer %10
-%179 = OpTypePointer StorageBuffer %43
-%182 = OpTypePointer StorageBuffer %35
-%183 = OpConstant  %4  4
-%195 = OpTypeVector %6 4
-%201 = OpVariable  %162  Output
-%219 = OpConstantNull  %6
-%223 = OpTypePointer StorageBuffer %6
-%226 = OpConstant  %4  64
-%77 = OpFunction  %2  None %78
-%76 = OpLabel
-%71 = OpVariable  %72  Function %8
-%73 = OpVariable  %74  Function %75
-%80 = OpAccessChain  %79  %64 %3
-OpBranch %82
-%82 = OpLabel
-%83 = OpLoad  %6  %71
-%84 = OpISub  %6  %83 %8
-OpStore %71 %84
-%86 = OpAccessChain  %85  %80 %3
-%87 = OpLoad  %45  %86
-%89 = OpAccessChain  %88  %80 %3 %3
-%90 = OpLoad  %39  %89
-%91 = OpLoad  %6  %71
-%92 = OpAccessChain  %88  %80 %3 %91
-%93 = OpLoad  %39  %92
-%95 = OpAccessChain  %94  %80 %3 %3 %30
-%96 = OpLoad  %10  %95
-%97 = OpLoad  %6  %71
-%98 = OpAccessChain  %94  %80 %3 %3 %97
-%99 = OpLoad  %10  %98
-%100 = OpLoad  %6  %71
-%101 = OpAccessChain  %94  %80 %3 %100 %30
-%102 = OpLoad  %10  %101
-%103 = OpLoad  %6  %71
-%104 = OpLoad  %6  %71
-%105 = OpAccessChain  %94  %80 %3 %103 %104
-%106 = OpLoad  %10  %105
-%107 = OpCompositeConstruct  %39  %9 %9
-%108 = OpCompositeConstruct  %39  %11 %11
-%109 = OpCompositeConstruct  %39  %12 %12
-%110 = OpCompositeConstruct  %45  %107 %108 %109
-%111 = OpCompositeConstruct  %46  %110
-OpStore %73 %111
-%112 = OpLoad  %6  %71
-%113 = OpIAdd  %6  %112 %8
-OpStore %71 %113
-%115 = OpCompositeConstruct  %39  %13 %13
-%116 = OpCompositeConstruct  %39  %14 %14
-%117 = OpCompositeConstruct  %39  %15 %15
-%118 = OpCompositeConstruct  %45  %115 %116 %117
-%119 = OpAccessChain  %114  %73 %3
-OpStore %119 %118
-%121 = OpCompositeConstruct  %39  %16 %16
-%122 = OpAccessChain  %120  %73 %3 %3
-OpStore %122 %121
-%123 = OpLoad  %6  %71
-%124 = OpCompositeConstruct  %39  %17 %17
-%125 = OpAccessChain  %120  %73 %3 %123
-OpStore %125 %124
-%127 = OpAccessChain  %126  %73 %3 %3 %30
-OpStore %127 %18
-%128 = OpLoad  %6  %71
-%129 = OpAccessChain  %126  %73 %3 %3 %128
-OpStore %129 %19
-%130 = OpLoad  %6  %71
-%131 = OpAccessChain  %126  %73 %3 %130 %30
-OpStore %131 %20
-%132 = OpLoad  %6  %71
-%133 = OpLoad  %6  %71
-%134 = OpAccessChain  %126  %73 %3 %132 %133
-OpStore %134 %21
+%22 = OpConstant  %10  0.0
+%23 = OpConstant  %10  8.0
+%24 = OpConstant  %10  7.0
+%25 = OpConstant  %6  10
+%26 = OpConstant  %6  5
+%27 = OpConstant  %6  4
+%28 = OpConstant  %6  9
+%29 = OpConstant  %4  3
+%30 = OpConstant  %4  2
+%31 = OpConstant  %6  3
+%32 = OpConstant  %4  1
+%33 = OpConstant  %6  42
+%34 = OpConstant  %4  42
+%35 = OpTypeVector %4 3
+%36 = OpTypeStruct %4 %35 %6
+%37 = OpTypeStruct %6
+%39 = OpTypeVector %10 3
+%38 = OpTypeMatrix %39 4
+%41 = OpTypeVector %10 2
+%40 = OpTypeMatrix %41 2
+%42 = OpTypeArray %40 %7
+%43 = OpTypeVector %4 2
+%44 = OpTypeArray %43 %7
+%45 = OpTypeRuntimeArray %37
+%46 = OpTypeStruct %38 %42 %6 %44 %45
+%47 = OpTypeMatrix %41 3
+%48 = OpTypeStruct %47
+%49 = OpTypeVector %6 2
+%50 = OpTypeMatrix %41 4
+%51 = OpTypeArray %50 %7
+%52 = OpTypeStruct %51
+%53 = OpTypePointer Function %10
+%54 = OpTypeArray %10 %25
+%55 = OpTypeArray %54 %26
+%56 = OpTypeVector %10 4
+%57 = OpTypePointer StorageBuffer %6
+%58 = OpTypeArray %6 %26
+%59 = OpTypePointer Workgroup %4
+%60 = OpConstantComposite  %35  %3 %3 %3
+%61 = OpConstantComposite  %36  %3 %60 %5
+%62 = OpConstantComposite  %41  %22 %22
+%63 = OpConstantComposite  %50  %62 %62 %62 %62
+%64 = OpConstantComposite  %51  %63 %63
+%65 = OpConstantComposite  %54  %22 %22 %22 %22 %22 %22 %22 %22 %22 %22
+%66 = OpConstantComposite  %55  %65 %65 %65 %65 %65
+%67 = OpConstantComposite  %49  %5 %5
+%69 = OpTypePointer Private %36
+%68 = OpVariable  %69  Private %61
+%71 = OpTypePointer StorageBuffer %46
+%70 = OpVariable  %71  StorageBuffer
+%73 = OpTypeStruct %48
+%74 = OpTypePointer Uniform %73
+%72 = OpVariable  %74  Uniform
+%76 = OpTypeStruct %49
+%77 = OpTypePointer StorageBuffer %76
+%75 = OpVariable  %77  StorageBuffer
+%79 = OpTypeStruct %52
+%80 = OpTypePointer Uniform %79
+%78 = OpVariable  %80  Uniform
+%81 = OpVariable  %59  Workgroup
+%83 = OpTypePointer Function %6
+%85 = OpTypePointer Function %48
+%86 = OpConstantNull  %48
+%89 = OpTypeFunction %2
+%90 = OpTypePointer Uniform %48
+%92 = OpTypePointer StorageBuffer %49
+%96 = OpTypePointer Uniform %47
+%99 = OpTypePointer Uniform %41
+%105 = OpTypePointer Uniform %10
+%125 = OpTypePointer Function %47
+%131 = OpTypePointer Function %41
+%137 = OpTypePointer Function %10
+%148 = OpTypePointer Function %52
+%149 = OpConstantNull  %52
+%152 = OpTypePointer Uniform %52
+%157 = OpTypePointer Uniform %51
+%160 = OpTypePointer Uniform %50
+%183 = OpTypePointer Function %51
+%185 = OpTypePointer Function %50
+%208 = OpTypeFunction %10 %53
+%214 = OpTypeFunction %10 %55
+%221 = OpTypeFunction %2 %59
+%225 = OpTypePointer Function %58
+%226 = OpConstantNull  %58
+%229 = OpTypePointer Input %4
+%228 = OpVariable  %229  Input
+%232 = OpTypePointer Output %56
+%231 = OpVariable  %232  Output
+%241 = OpTypePointer StorageBuffer %38
+%244 = OpTypePointer StorageBuffer %44
+%247 = OpTypePointer StorageBuffer %39
+%248 = OpTypePointer StorageBuffer %10
+%251 = OpTypePointer StorageBuffer %45
+%254 = OpTypePointer StorageBuffer %37
+%255 = OpConstant  %4  4
+%267 = OpTypeVector %6 4
+%273 = OpVariable  %232  Output
+%291 = OpConstantNull  %6
+%295 = OpTypePointer StorageBuffer %6
+%298 = OpConstant  %4  64
+%88 = OpFunction  %2  None %89
+%87 = OpLabel
+%82 = OpVariable  %83  Function %8
+%84 = OpVariable  %85  Function %86
+%91 = OpAccessChain  %90  %72 %3
+OpBranch %93
+%93 = OpLabel
+%94 = OpLoad  %6  %82
+%95 = OpISub  %6  %94 %8
+OpStore %82 %95
+%97 = OpAccessChain  %96  %91 %3
+%98 = OpLoad  %47  %97
+%100 = OpAccessChain  %99  %91 %3 %3
+%101 = OpLoad  %41  %100
+%102 = OpLoad  %6  %82
+%103 = OpAccessChain  %99  %91 %3 %102
+%104 = OpLoad  %41  %103
+%106 = OpAccessChain  %105  %91 %3 %3 %32
+%107 = OpLoad  %10  %106
+%108 = OpLoad  %6  %82
+%109 = OpAccessChain  %105  %91 %3 %3 %108
+%110 = OpLoad  %10  %109
+%111 = OpLoad  %6  %82
+%112 = OpAccessChain  %105  %91 %3 %111 %32
+%113 = OpLoad  %10  %112
+%114 = OpLoad  %6  %82
+%115 = OpLoad  %6  %82
+%116 = OpAccessChain  %105  %91 %3 %114 %115
+%117 = OpLoad  %10  %116
+%118 = OpCompositeConstruct  %41  %9 %9
+%119 = OpCompositeConstruct  %41  %11 %11
+%120 = OpCompositeConstruct  %41  %12 %12
+%121 = OpCompositeConstruct  %47  %118 %119 %120
+%122 = OpCompositeConstruct  %48  %121
+OpStore %84 %122
+%123 = OpLoad  %6  %82
+%124 = OpIAdd  %6  %123 %8
+OpStore %82 %124
+%126 = OpCompositeConstruct  %41  %13 %13
+%127 = OpCompositeConstruct  %41  %14 %14
+%128 = OpCompositeConstruct  %41  %15 %15
+%129 = OpCompositeConstruct  %47  %126 %127 %128
+%130 = OpAccessChain  %125  %84 %3
+OpStore %130 %129
+%132 = OpCompositeConstruct  %41  %16 %16
+%133 = OpAccessChain  %131  %84 %3 %3
+OpStore %133 %132
+%134 = OpLoad  %6  %82
+%135 = OpCompositeConstruct  %41  %17 %17
+%136 = OpAccessChain  %131  %84 %3 %134
+OpStore %136 %135
+%138 = OpAccessChain  %137  %84 %3 %3 %32
+OpStore %138 %18
+%139 = OpLoad  %6  %82
+%140 = OpAccessChain  %137  %84 %3 %3 %139
+OpStore %140 %19
+%141 = OpLoad  %6  %82
+%142 = OpAccessChain  %137  %84 %3 %141 %32
+OpStore %142 %20
+%143 = OpLoad  %6  %82
+%144 = OpLoad  %6  %82
+%145 = OpAccessChain  %137  %84 %3 %143 %144
+OpStore %145 %21
 OpReturn
 OpFunctionEnd
-%137 = OpFunction  %10  None %138
-%136 = OpFunctionParameter  %48
-%135 = OpLabel
-OpBranch %139
-%139 = OpLabel
-%140 = OpLoad  %10  %136
-OpReturnValue %140
+%151 = OpFunction  %2  None %89
+%150 = OpLabel
+%146 = OpVariable  %83  Function %8
+%147 = OpVariable  %148  Function %149
+%153 = OpAccessChain  %152  %78 %3
+OpBranch %154
+%154 = OpLabel
+%155 = OpLoad  %6  %146
+%156 = OpISub  %6  %155 %8
+OpStore %146 %156
+%158 = OpAccessChain  %157  %153 %3
+%159 = OpLoad  %51  %158
+%161 = OpAccessChain  %160  %153 %3 %3
+%162 = OpLoad  %50  %161
+%163 = OpAccessChain  %99  %153 %3 %3 %3
+%164 = OpLoad  %41  %163
+%165 = OpLoad  %6  %146
+%166 = OpAccessChain  %99  %153 %3 %3 %165
+%167 = OpLoad  %41  %166
+%168 = OpAccessChain  %105  %153 %3 %3 %3 %32
+%169 = OpLoad  %10  %168
+%170 = OpLoad  %6  %146
+%171 = OpAccessChain  %105  %153 %3 %3 %3 %170
+%172 = OpLoad  %10  %171
+%173 = OpLoad  %6  %146
+%174 = OpAccessChain  %105  %153 %3 %3 %173 %32
+%175 = OpLoad  %10  %174
+%176 = OpLoad  %6  %146
+%177 = OpLoad  %6  %146
+%178 = OpAccessChain  %105  %153 %3 %3 %176 %177
+%179 = OpLoad  %10  %178
+%180 = OpCompositeConstruct  %52  %64
+OpStore %147 %180
+%181 = OpLoad  %6  %146
+%182 = OpIAdd  %6  %181 %8
+OpStore %146 %182
+%184 = OpAccessChain  %183  %147 %3
+OpStore %184 %64
+%186 = OpCompositeConstruct  %41  %23 %23
+%187 = OpCompositeConstruct  %41  %24 %24
+%188 = OpCompositeConstruct  %41  %13 %13
+%189 = OpCompositeConstruct  %41  %14 %14
+%190 = OpCompositeConstruct  %50  %186 %187 %188 %189
+%191 = OpAccessChain  %185  %147 %3 %3
+OpStore %191 %190
+%192 = OpCompositeConstruct  %41  %16 %16
+%193 = OpAccessChain  %131  %147 %3 %3 %3
+OpStore %193 %192
+%194 = OpLoad  %6  %146
+%195 = OpCompositeConstruct  %41  %17 %17
+%196 = OpAccessChain  %131  %147 %3 %3 %194
+OpStore %196 %195
+%197 = OpAccessChain  %137  %147 %3 %3 %3 %32
+OpStore %197 %18
+%198 = OpLoad  %6  %146
+%199 = OpAccessChain  %137  %147 %3 %3 %3 %198
+OpStore %199 %19
+%200 = OpLoad  %6  %146
+%201 = OpAccessChain  %137  %147 %3 %3 %200 %32
+OpStore %201 %20
+%202 = OpLoad  %6  %146
+%203 = OpLoad  %6  %146
+%204 = OpAccessChain  %137  %147 %3 %3 %202 %203
+OpStore %204 %21
+OpReturn
+OpFunctionEnd
+%207 = OpFunction  %10  None %208
+%206 = OpFunctionParameter  %53
+%205 = OpLabel
+OpBranch %209
+%209 = OpLabel
+%210 = OpLoad  %10  %206
+OpReturnValue %210
 OpFunctionEnd
-%143 = OpFunction  %10  None %144
-%142 = OpFunctionParameter  %50
-%141 = OpLabel
-OpBranch %145
-%145 = OpLabel
-%146 = OpCompositeExtract  %49  %142 4
-%147 = OpCompositeExtract  %10  %146 9
-OpReturnValue %147
+%213 = OpFunction  %10  None %214
+%212 = OpFunctionParameter  %55
+%211 = OpLabel
+OpBranch %215
+%215 = OpLabel
+%216 = OpCompositeExtract  %54  %212 4
+%217 = OpCompositeExtract  %10  %216 9
+OpReturnValue %217
 OpFunctionEnd
-%150 = OpFunction  %2  None %151
-%149 = OpFunctionParameter  %54
-%148 = OpLabel
-OpBranch %152
-%152 = OpLabel
-OpStore %149 %32
+%220 = OpFunction  %2  None %221
+%219 = OpFunctionParameter  %59
+%218 = OpLabel
+OpBranch %222
+%222 = OpLabel
+OpStore %219 %34
 OpReturn
 OpFunctionEnd
-%163 = OpFunction  %2  None %78
-%157 = OpLabel
-%153 = OpVariable  %48  Function %26
-%154 = OpVariable  %155  Function %156
-%160 = OpLoad  %4  %158
-%164 = OpAccessChain  %79  %64 %3
-%165 = OpAccessChain  %81  %67 %3
-OpBranch %166
-%166 = OpLabel
-%167 = OpLoad  %10  %153
-OpStore %153 %9
-%168 = OpFunctionCall  %2  %77
-%170 = OpAccessChain  %169  %62 %3
-%171 = OpLoad  %36  %170
-%173 = OpAccessChain  %172  %62 %27
-%174 = OpLoad  %42  %173
-%177 = OpAccessChain  %176  %62 %3 %27 %3
-%178 = OpLoad  %10  %177
-%180 = OpArrayLength  %4  %62 4
-%181 = OpISub  %4  %180 %28
-%184 = OpAccessChain  %52  %62 %183 %181 %3
-%185 = OpLoad  %6  %184
-%186 = OpLoad  %47  %165
-%187 = OpFunctionCall  %10  %137 %153
-%188 = OpConvertFToS  %6  %178
-%189 = OpCompositeConstruct  %53  %185 %188 %29 %24 %23
-OpStore %154 %189
-%190 = OpIAdd  %4  %160 %30
-%191 = OpAccessChain  %72  %154 %190
-OpStore %191 %31
-%192 = OpAccessChain  %72  %154 %160
-%193 = OpLoad  %6  %192
-%194 = OpFunctionCall  %10  %143 %58
-%196 = OpCompositeConstruct  %195  %193 %193 %193 %193
-%197 = OpConvertSToF  %51  %196
-%198 = OpMatrixTimesVector  %37  %171 %197
-%199 = OpCompositeConstruct  %51  %198 %11
-OpStore %161 %199
+%233 = OpFunction  %2  None %89
+%227 = OpLabel
+%223 = OpVariable  %53  Function %22
+%224 = OpVariable  %225  Function %226
+%230 = OpLoad  %4  %228
+%234 = OpAccessChain  %90  %72 %3
+%235 = OpAccessChain  %92  %75 %3
+%236 = OpAccessChain  %152  %78 %3
+OpBranch %237
+%237 = OpLabel
+%238 = OpLoad  %10  %223
+OpStore %223 %9
+%239 = OpFunctionCall  %2  %88
+%240 = OpFunctionCall  %2  %151
+%242 = OpAccessChain  %241  %70 %3
+%243 = OpLoad  %38  %242
+%245 = OpAccessChain  %244  %70 %29
+%246 = OpLoad  %44  %245
+%249 = OpAccessChain  %248  %70 %3 %29 %3
+%250 = OpLoad  %10  %249
+%252 = OpArrayLength  %4  %70 4
+%253 = OpISub  %4  %252 %30
+%256 = OpAccessChain  %57  %70 %255 %253 %3
+%257 = OpLoad  %6  %256
+%258 = OpLoad  %49  %235
+%259 = OpFunctionCall  %10  %207 %223
+%260 = OpConvertFToS  %6  %250
+%261 = OpCompositeConstruct  %58  %257 %260 %31 %27 %26
+OpStore %224 %261
+%262 = OpIAdd  %4  %230 %32
+%263 = OpAccessChain  %83  %224 %262
+OpStore %263 %33
+%264 = OpAccessChain  %83  %224 %230
+%265 = OpLoad  %6  %264
+%266 = OpFunctionCall  %10  %213 %66
+%268 = OpCompositeConstruct  %267  %265 %265 %265 %265
+%269 = OpConvertSToF  %56  %268
+%270 = OpMatrixTimesVector  %39  %243 %269
+%271 = OpCompositeConstruct  %56  %270 %11
+OpStore %231 %271
 OpReturn
 OpFunctionEnd
-%202 = OpFunction  %2  None %78
-%200 = OpLabel
-%203 = OpAccessChain  %81  %67 %3
-OpBranch %204
-%204 = OpLabel
-%205 = OpAccessChain  %176  %62 %3 %30 %28
-OpStore %205 %9
-%206 = OpCompositeConstruct  %37  %26 %26 %26
-%207 = OpCompositeConstruct  %37  %9 %9 %9
-%208 = OpCompositeConstruct  %37  %11 %11 %11
-%209 = OpCompositeConstruct  %37  %12 %12 %12
-%210 = OpCompositeConstruct  %36  %206 %207 %208 %209
-%211 = OpAccessChain  %169  %62 %3
-OpStore %211 %210
-%212 = OpCompositeConstruct  %41  %3 %3
-%213 = OpCompositeConstruct  %41  %30 %30
-%214 = OpCompositeConstruct  %42  %212 %213
-%215 = OpAccessChain  %172  %62 %27
-OpStore %215 %214
-%216 = OpAccessChain  %52  %62 %183 %30 %3
-OpStore %216 %8
-OpStore %203 %59
-%217 = OpCompositeConstruct  %51  %26 %26 %26 %26
-OpStore %201 %217
+%274 = OpFunction  %2  None %89
+%272 = OpLabel
+%275 = OpAccessChain  %92  %75 %3
+OpBranch %276
+%276 = OpLabel
+%277 = OpAccessChain  %248  %70 %3 %32 %30
+OpStore %277 %9
+%278 = OpCompositeConstruct  %39  %22 %22 %22
+%279 = OpCompositeConstruct  %39  %9 %9 %9
+%280 = OpCompositeConstruct  %39  %11 %11 %11
+%281 = OpCompositeConstruct  %39  %12 %12 %12
+%282 = OpCompositeConstruct  %38  %278 %279 %280 %281
+%283 = OpAccessChain  %241  %70 %3
+OpStore %283 %282
+%284 = OpCompositeConstruct  %43  %3 %3
+%285 = OpCompositeConstruct  %43  %32 %32
+%286 = OpCompositeConstruct  %44  %284 %285
+%287 = OpAccessChain  %244  %70 %29
+OpStore %287 %286
+%288 = OpAccessChain  %57  %70 %255 %32 %3
+OpStore %288 %8
+OpStore %275 %67
+%289 = OpCompositeConstruct  %56  %22 %22 %22 %22
+OpStore %273 %289
 OpReturn
 OpFunctionEnd
-%221 = OpFunction  %2  None %78
-%220 = OpLabel
-%218 = OpVariable  %72  Function %219
-OpBranch %222
-%222 = OpLabel
-%224 = OpAccessChain  %223  %62 %28
-%225 = OpAtomicLoad  %6  %224 %8 %226
-%228 = OpAccessChain  %223  %62 %28
-%227 = OpAtomicIAdd  %6  %228 %8 %226 %23
-OpStore %218 %227
-%230 = OpAccessChain  %223  %62 %28
-%229 = OpAtomicISub  %6  %230 %8 %226 %23
-OpStore %218 %229
-%232 = OpAccessChain  %223  %62 %28
-%231 = OpAtomicAnd  %6  %232 %8 %226 %23
-OpStore %218 %231
-%234 = OpAccessChain  %223  %62 %28
-%233 = OpAtomicOr  %6  %234 %8 %226 %23
-OpStore %218 %233
-%236 = OpAccessChain  %223  %62 %28
-%235 = OpAtomicXor  %6  %236 %8 %226 %23
-OpStore %218 %235
-%238 = OpAccessChain  %223  %62 %28
-%237 = OpAtomicSMin  %6  %238 %8 %226 %23
-OpStore %218 %237
-%240 = OpAccessChain  %223  %62 %28
-%239 = OpAtomicSMax  %6  %240 %8 %226 %23
-OpStore %218 %239
-%242 = OpAccessChain  %223  %62 %28
-%241 = OpAtomicExchange  %6  %242 %8 %226 %23
-OpStore %218 %241
-%243 = OpAccessChain  %223  %62 %28
-OpAtomicStore %243 %8 %226 %225
+%293 = OpFunction  %2  None %89
+%292 = OpLabel
+%290 = OpVariable  %83  Function %291
+OpBranch %294
+%294 = OpLabel
+%296 = OpAccessChain  %295  %70 %30
+%297 = OpAtomicLoad  %6  %296 %8 %298
+%300 = OpAccessChain  %295  %70 %30
+%299 = OpAtomicIAdd  %6  %300 %8 %298 %26
+OpStore %290 %299
+%302 = OpAccessChain  %295  %70 %30
+%301 = OpAtomicISub  %6  %302 %8 %298 %26
+OpStore %290 %301
+%304 = OpAccessChain  %295  %70 %30
+%303 = OpAtomicAnd  %6  %304 %8 %298 %26
+OpStore %290 %303
+%306 = OpAccessChain  %295  %70 %30
+%305 = OpAtomicOr  %6  %306 %8 %298 %26
+OpStore %290 %305
+%308 = OpAccessChain  %295  %70 %30
+%307 = OpAtomicXor  %6  %308 %8 %298 %26
+OpStore %290 %307
+%310 = OpAccessChain  %295  %70 %30
+%309 = OpAtomicSMin  %6  %310 %8 %298 %26
+OpStore %290 %309
+%312 = OpAccessChain  %295  %70 %30
+%311 = OpAtomicSMax  %6  %312 %8 %298 %26
+OpStore %290 %311
+%314 = OpAccessChain  %295  %70 %30
+%313 = OpAtomicExchange  %6  %314 %8 %298 %26
+OpStore %290 %313
+%315 = OpAccessChain  %295  %70 %30
+OpAtomicStore %315 %8 %298 %297
 OpReturn
 OpFunctionEnd
-%245 = OpFunction  %2  None %78
-%244 = OpLabel
-OpBranch %246
-%246 = OpLabel
-%247 = OpFunctionCall  %2  %150 %70
+%317 = OpFunction  %2  None %89
+%316 = OpLabel
+OpBranch %318
+%318 = OpLabel
+%319 = OpFunctionCall  %2  %220 %81
 OpReturn
 OpFunctionEnd
\ No newline at end of file
diff --git a/tests/out/spv/globals.spvasm b/tests/out/spv/globals.spvasm
index 8109c3ebfd..2c1691c0cf 100644
--- a/tests/out/spv/globals.spvasm
+++ b/tests/out/spv/globals.spvasm
@@ -1,41 +1,53 @@
 ; SPIR-V
 ; Version: 1.1
 ; Generator: rspirv
-; Bound: 138
+; Bound: 169
 OpCapability Shader
 OpExtension "SPV_KHR_storage_buffer_storage_class"
 %1 = OpExtInstImport "GLSL.std.450"
 OpMemoryModel Logical GLSL450
-OpEntryPoint GLCompute %99 "main"
-OpExecutionMode %99 LocalSize 1 1 1
-OpDecorate %24 ArrayStride 4
-OpMemberDecorate %26 0 Offset 0
-OpMemberDecorate %26 1 Offset 12
-OpDecorate %28 ArrayStride 8
-OpDecorate %30 ArrayStride 16
-OpDecorate %39 DescriptorSet 0
-OpDecorate %39 Binding 1
-OpDecorate %40 Block
-OpMemberDecorate %40 0 Offset 0
-OpDecorate %42 NonWritable
-OpDecorate %42 DescriptorSet 0
-OpDecorate %42 Binding 2
-OpDecorate %43 Block
-OpMemberDecorate %43 0 Offset 0
-OpDecorate %45 DescriptorSet 0
-OpDecorate %45 Binding 3
-OpDecorate %46 Block
-OpMemberDecorate %46 0 Offset 0
-OpDecorate %48 DescriptorSet 0
-OpDecorate %48 Binding 4
-OpDecorate %49 Block
-OpMemberDecorate %49 0 Offset 0
-OpDecorate %51 DescriptorSet 0
-OpDecorate %51 Binding 5
-OpDecorate %52 Block
-OpMemberDecorate %52 0 Offset 0
-OpMemberDecorate %52 0 ColMajor
-OpMemberDecorate %52 0 MatrixStride 16
+OpEntryPoint GLCompute %114 "main"
+OpExecutionMode %114 LocalSize 1 1 1
+OpDecorate %25 ArrayStride 4
+OpMemberDecorate %27 0 Offset 0
+OpMemberDecorate %27 1 Offset 12
+OpDecorate %29 ArrayStride 8
+OpDecorate %31 ArrayStride 16
+OpDecorate %34 ArrayStride 32
+OpDecorate %35 ArrayStride 64
+OpDecorate %37 ArrayStride 32
+OpDecorate %38 ArrayStride 64
+OpDecorate %46 DescriptorSet 0
+OpDecorate %46 Binding 1
+OpDecorate %47 Block
+OpMemberDecorate %47 0 Offset 0
+OpDecorate %49 NonWritable
+OpDecorate %49 DescriptorSet 0
+OpDecorate %49 Binding 2
+OpDecorate %50 Block
+OpMemberDecorate %50 0 Offset 0
+OpDecorate %52 DescriptorSet 0
+OpDecorate %52 Binding 3
+OpDecorate %53 Block
+OpMemberDecorate %53 0 Offset 0
+OpDecorate %55 DescriptorSet 0
+OpDecorate %55 Binding 4
+OpDecorate %56 Block
+OpMemberDecorate %56 0 Offset 0
+OpDecorate %58 DescriptorSet 0
+OpDecorate %58 Binding 5
+OpDecorate %59 Block
+OpMemberDecorate %59 0 Offset 0
+OpMemberDecorate %59 0 ColMajor
+OpMemberDecorate %59 0 MatrixStride 8
+OpDecorate %61 DescriptorSet 0
+OpDecorate %61 Binding 6
+OpDecorate %62 Block
+OpMemberDecorate %62 0 Offset 0
+OpDecorate %64 DescriptorSet 0
+OpDecorate %64 Binding 7
+OpDecorate %65 Block
+OpMemberDecorate %65 0 Offset 0
 %2 = OpTypeVoid
 %4 = OpTypeBool
 %3 = OpConstantTrue  %4
@@ -43,153 +55,185 @@ OpMemberDecorate %52 0 MatrixStride 16
 %5 = OpConstant  %6  10
 %8 = OpTypeInt 32 1
 %7 = OpConstant  %8  20
-%10 = OpTypeFloat 32
-%9 = OpConstant  %10  1.0
-%11 = OpConstant  %8  1
-%12 = OpConstant  %8  0
-%13 = OpConstant  %10  2.0
-%14 = OpConstant  %10  3.0
-%15 = OpConstant  %10  0.0
-%16 = OpConstant  %8  6
-%17 = OpConstant  %8  5
-%18 = OpConstant  %8  4
-%19 = OpConstant  %8  3
-%20 = OpConstant  %8  2
-%21 = OpConstant  %10  4.0
-%22 = OpConstant  %6  2
-%23 = OpConstantTrue  %4
-%24 = OpTypeArray %10 %5
-%25 = OpTypeVector %10 3
-%26 = OpTypeStruct %25 %10
-%27 = OpTypeVector %10 2
-%28 = OpTypeRuntimeArray %27
-%29 = OpTypeVector %10 4
-%30 = OpTypeArray %29 %7
-%31 = OpTypeMatrix %29 4
-%32 = OpTypeMatrix %25 3
-%33 = OpConstantComposite  %25  %15 %15 %15
-%34 = OpConstantComposite  %32  %33 %33 %33
-%36 = OpTypePointer Workgroup %24
-%35 = OpVariable  %36  Workgroup
-%38 = OpTypePointer Workgroup %6
-%37 = OpVariable  %38  Workgroup
-%40 = OpTypeStruct %26
-%41 = OpTypePointer StorageBuffer %40
-%39 = OpVariable  %41  StorageBuffer
-%43 = OpTypeStruct %28
-%44 = OpTypePointer StorageBuffer %43
-%42 = OpVariable  %44  StorageBuffer
-%46 = OpTypeStruct %30
-%47 = OpTypePointer Uniform %46
-%45 = OpVariable  %47  Uniform
-%49 = OpTypeStruct %29
-%50 = OpTypePointer Uniform %49
-%48 = OpVariable  %50  Uniform
-%52 = OpTypeStruct %31
-%53 = OpTypePointer Uniform %52
-%51 = OpVariable  %53  Uniform
-%57 = OpTypeFunction %2 %25
-%58 = OpTypePointer StorageBuffer %28
-%59 = OpTypePointer Uniform %29
-%60 = OpTypePointer StorageBuffer %26
-%61 = OpTypePointer Uniform %30
-%62 = OpTypePointer Uniform %31
-%65 = OpTypePointer Function %8
-%68 = OpTypeFunction %2
-%69 = OpConstant  %6  0
-%72 = OpTypePointer StorageBuffer %25
-%75 = OpTypePointer StorageBuffer %10
-%95 = OpTypePointer Function %10
-%97 = OpTypePointer Function %4
-%107 = OpTypePointer Workgroup %10
-%112 = OpConstant  %6  6
-%114 = OpTypePointer StorageBuffer %27
-%115 = OpConstant  %6  1
-%118 = OpConstant  %6  5
-%120 = OpTypePointer Uniform %10
-%121 = OpConstant  %6  3
-%124 = OpConstant  %6  4
-%126 = OpTypePointer StorageBuffer %10
-%137 = OpConstant  %6  256
-%56 = OpFunction  %2  None %57
-%55 = OpFunctionParameter  %25
-%54 = OpLabel
-OpBranch %63
-%63 = OpLabel
+%9 = OpConstant  %8  2
+%11 = OpTypeFloat 32
+%10 = OpConstant  %11  1.0
+%12 = OpConstant  %8  1
+%13 = OpConstant  %8  0
+%14 = OpConstant  %11  2.0
+%15 = OpConstant  %11  3.0
+%16 = OpConstant  %11  0.0
+%17 = OpConstant  %8  7
+%18 = OpConstant  %8  6
+%19 = OpConstant  %8  5
+%20 = OpConstant  %8  4
+%21 = OpConstant  %8  3
+%22 = OpConstant  %11  4.0
+%23 = OpConstant  %6  2
+%24 = OpConstantTrue  %4
+%25 = OpTypeArray %11 %5
+%26 = OpTypeVector %11 3
+%27 = OpTypeStruct %26 %11
+%28 = OpTypeVector %11 2
+%29 = OpTypeRuntimeArray %28
+%30 = OpTypeVector %11 4
+%31 = OpTypeArray %30 %7
+%32 = OpTypeMatrix %28 3
+%33 = OpTypeMatrix %30 2
+%34 = OpTypeArray %33 %9
+%35 = OpTypeArray %34 %9
+%36 = OpTypeMatrix %28 4
+%37 = OpTypeArray %36 %9
+%38 = OpTypeArray %37 %9
+%39 = OpTypeMatrix %26 3
+%40 = OpConstantComposite  %26  %16 %16 %16
+%41 = OpConstantComposite  %39  %40 %40 %40
+%43 = OpTypePointer Workgroup %25
+%42 = OpVariable  %43  Workgroup
+%45 = OpTypePointer Workgroup %6
+%44 = OpVariable  %45  Workgroup
+%47 = OpTypeStruct %27
+%48 = OpTypePointer StorageBuffer %47
+%46 = OpVariable  %48  StorageBuffer
+%50 = OpTypeStruct %29
+%51 = OpTypePointer StorageBuffer %50
+%49 = OpVariable  %51  StorageBuffer
+%53 = OpTypeStruct %31
+%54 = OpTypePointer Uniform %53
+%52 = OpVariable  %54  Uniform
+%56 = OpTypeStruct %26
+%57 = OpTypePointer Uniform %56
+%55 = OpVariable  %57  Uniform
+%59 = OpTypeStruct %32
+%60 = OpTypePointer Uniform %59
+%58 = OpVariable  %60  Uniform
+%62 = OpTypeStruct %35
+%63 = OpTypePointer Uniform %62
+%61 = OpVariable  %63  Uniform
+%65 = OpTypeStruct %38
+%66 = OpTypePointer Uniform %65
+%64 = OpVariable  %66  Uniform
+%70 = OpTypeFunction %2 %26
+%71 = OpTypePointer StorageBuffer %29
+%72 = OpTypePointer Uniform %26
+%73 = OpTypePointer StorageBuffer %27
+%74 = OpTypePointer Uniform %35
+%75 = OpTypePointer Uniform %31
+%76 = OpTypePointer Uniform %38
+%77 = OpTypePointer Uniform %32
+%80 = OpTypePointer Function %8
+%83 = OpTypeFunction %2
+%84 = OpConstant  %6  0
+%87 = OpTypePointer StorageBuffer %26
+%90 = OpTypePointer StorageBuffer %11
+%110 = OpTypePointer Function %11
+%112 = OpTypePointer Function %4
+%124 = OpTypePointer Workgroup %11
+%125 = OpTypePointer Uniform %37
+%126 = OpTypePointer Uniform %36
+%129 = OpTypePointer Uniform %34
+%130 = OpTypePointer Uniform %33
+%131 = OpTypePointer Uniform %30
+%136 = OpConstant  %6  7
+%142 = OpConstant  %6  6
+%144 = OpTypePointer StorageBuffer %28
+%145 = OpConstant  %6  1
+%148 = OpConstant  %6  5
+%150 = OpTypePointer Uniform %30
+%151 = OpTypePointer Uniform %11
+%152 = OpConstant  %6  3
+%155 = OpConstant  %6  4
+%157 = OpTypePointer StorageBuffer %11
+%168 = OpConstant  %6  256
+%69 = OpFunction  %2  None %70
+%68 = OpFunctionParameter  %26
+%67 = OpLabel
+OpBranch %78
+%78 = OpLabel
 OpReturn
 OpFunctionEnd
-%67 = OpFunction  %2  None %68
-%66 = OpLabel
-%64 = OpVariable  %65  Function %11
-%70 = OpAccessChain  %60  %39 %69
-OpBranch %71
-%71 = OpLabel
-%73 = OpCompositeConstruct  %25  %9 %9 %9
-%74 = OpAccessChain  %72  %70 %69
-OpStore %74 %73
-%76 = OpAccessChain  %75  %70 %69 %69
-OpStore %76 %9
-%77 = OpAccessChain  %75  %70 %69 %69
-OpStore %77 %13
-%78 = OpLoad  %8  %64
-%79 = OpAccessChain  %75  %70 %69 %78
-OpStore %79 %14
-%80 = OpLoad  %26  %70
-%81 = OpCompositeExtract  %25  %80 0
-%82 = OpCompositeExtract  %25  %80 0
-%83 = OpVectorShuffle  %27  %82 %82 2 0
-%84 = OpCompositeExtract  %25  %80 0
-%85 = OpFunctionCall  %2  %56 %84
-%86 = OpCompositeExtract  %25  %80 0
-%87 = OpVectorTimesMatrix  %25  %86 %34
-%88 = OpCompositeExtract  %25  %80 0
-%89 = OpMatrixTimesVector  %25  %34 %88
-%90 = OpCompositeExtract  %25  %80 0
-%91 = OpVectorTimesScalar  %25  %90 %13
-%92 = OpCompositeExtract  %25  %80 0
-%93 = OpVectorTimesScalar  %25  %92 %13
+%82 = OpFunction  %2  None %83
+%81 = OpLabel
+%79 = OpVariable  %80  Function %12
+%85 = OpAccessChain  %73  %46 %84
+OpBranch %86
+%86 = OpLabel
+%88 = OpCompositeConstruct  %26  %10 %10 %10
+%89 = OpAccessChain  %87  %85 %84
+OpStore %89 %88
+%91 = OpAccessChain  %90  %85 %84 %84
+OpStore %91 %10
+%92 = OpAccessChain  %90  %85 %84 %84
+OpStore %92 %14
+%93 = OpLoad  %8  %79
+%94 = OpAccessChain  %90  %85 %84 %93
+OpStore %94 %15
+%95 = OpLoad  %27  %85
+%96 = OpCompositeExtract  %26  %95 0
+%97 = OpCompositeExtract  %26  %95 0
+%98 = OpVectorShuffle  %28  %97 %97 2 0
+%99 = OpCompositeExtract  %26  %95 0
+%100 = OpFunctionCall  %2  %69 %99
+%101 = OpCompositeExtract  %26  %95 0
+%102 = OpVectorTimesMatrix  %26  %101 %41
+%103 = OpCompositeExtract  %26  %95 0
+%104 = OpMatrixTimesVector  %26  %41 %103
+%105 = OpCompositeExtract  %26  %95 0
+%106 = OpVectorTimesScalar  %26  %105 %14
+%107 = OpCompositeExtract  %26  %95 0
+%108 = OpVectorTimesScalar  %26  %107 %14
 OpReturn
 OpFunctionEnd
-%99 = OpFunction  %2  None %68
-%98 = OpLabel
-%94 = OpVariable  %95  Function %9
-%96 = OpVariable  %97  Function %23
-%100 = OpAccessChain  %60  %39 %69
-%101 = OpAccessChain  %58  %42 %69
-%102 = OpAccessChain  %61  %45 %69
-%103 = OpAccessChain  %59  %48 %69
-%104 = OpAccessChain  %62  %51 %69
-OpBranch %105
-%105 = OpLabel
-%106 = OpFunctionCall  %2  %67
-%108 = OpLoad  %31  %104
-%109 = OpLoad  %29  %103
-%110 = OpMatrixTimesVector  %29  %108 %109
-%111 = OpCompositeExtract  %10  %110 0
-%113 = OpAccessChain  %107  %35 %112
-OpStore %113 %111
-%116 = OpAccessChain  %75  %101 %115 %115
-%117 = OpLoad  %10  %116
-%119 = OpAccessChain  %107  %35 %118
-OpStore %119 %117
-%122 = OpAccessChain  %120  %102 %69 %121
-%123 = OpLoad  %10  %122
-%125 = OpAccessChain  %107  %35 %124
-OpStore %125 %123
-%127 = OpAccessChain  %126  %100 %115
-%128 = OpLoad  %10  %127
-%129 = OpAccessChain  %107  %35 %121
-OpStore %129 %128
-%130 = OpAccessChain  %75  %100 %69 %69
-%131 = OpLoad  %10  %130
-%132 = OpAccessChain  %107  %35 %22
-OpStore %132 %131
-%133 = OpAccessChain  %126  %100 %115
-OpStore %133 %21
-%134 = OpArrayLength  %6  %42 0
-%135 = OpConvertUToF  %10  %134
-%136 = OpAccessChain  %107  %35 %115
-OpStore %136 %135
-OpAtomicStore %37 %20 %137 %22
+%114 = OpFunction  %2  None %83
+%113 = OpLabel
+%109 = OpVariable  %110  Function %10
+%111 = OpVariable  %112  Function %24
+%115 = OpAccessChain  %73  %46 %84
+%116 = OpAccessChain  %71  %49 %84
+%117 = OpAccessChain  %75  %52 %84
+%118 = OpAccessChain  %72  %55 %84
+%119 = OpAccessChain  %77  %58 %84
+%120 = OpAccessChain  %74  %61 %84
+%121 = OpAccessChain  %76  %64 %84
+OpBranch %122
+%122 = OpLabel
+%123 = OpFunctionCall  %2  %82
+%127 = OpAccessChain  %126  %121 %84 %84
+%128 = OpLoad  %36  %127
+%132 = OpAccessChain  %131  %120 %84 %84 %84
+%133 = OpLoad  %30  %132
+%134 = OpMatrixTimesVector  %28  %128 %133
+%135 = OpCompositeExtract  %11  %134 0
+%137 = OpAccessChain  %124  %42 %136
+OpStore %137 %135
+%138 = OpLoad  %32  %119
+%139 = OpLoad  %26  %118
+%140 = OpMatrixTimesVector  %28  %138 %139
+%141 = OpCompositeExtract  %11  %140 0
+%143 = OpAccessChain  %124  %42 %142
+OpStore %143 %141
+%146 = OpAccessChain  %90  %116 %145 %145
+%147 = OpLoad  %11  %146
+%149 = OpAccessChain  %124  %42 %148
+OpStore %149 %147
+%153 = OpAccessChain  %151  %117 %84 %152
+%154 = OpLoad  %11  %153
+%156 = OpAccessChain  %124  %42 %155
+OpStore %156 %154
+%158 = OpAccessChain  %157  %115 %145
+%159 = OpLoad  %11  %158
+%160 = OpAccessChain  %124  %42 %152
+OpStore %160 %159
+%161 = OpAccessChain  %90  %115 %84 %84
+%162 = OpLoad  %11  %161
+%163 = OpAccessChain  %124  %42 %23
+OpStore %163 %162
+%164 = OpAccessChain  %157  %115 %145
+OpStore %164 %22
+%165 = OpArrayLength  %6  %49 0
+%166 = OpConvertUToF  %11  %165
+%167 = OpAccessChain  %124  %42 %145
+OpStore %167 %166
+OpAtomicStore %44 %9 %168 %23
 OpReturn
 OpFunctionEnd
\ No newline at end of file
diff --git a/tests/out/wgsl/access.wgsl b/tests/out/wgsl/access.wgsl
index 74caf8cd87..2a25704545 100644
--- a/tests/out/wgsl/access.wgsl
+++ b/tests/out/wgsl/access.wgsl
@@ -20,6 +20,10 @@ struct Baz {
     m: mat3x2<f32>,
 }
 
+struct MatCx2InArray {
+    am: array<mat4x2<f32>,2>,
+}
+
 var<private> global_const: GlobalConst = GlobalConst(0u, vec3<u32>(0u, 0u, 0u), 0);
 @group(0) @binding(0) 
 var<storage, read_write> bar: Bar;
@@ -27,6 +31,8 @@ var<storage, read_write> bar: Bar;
 var<uniform> baz: Baz;
 @group(0) @binding(2) 
 var<storage, read_write> qux: vec2<i32>;
+@group(0) @binding(3) 
+var<uniform> nested_mat_cx2_: MatCx2InArray;
 var<workgroup> val: u32;
 
 fn test_matrix_within_struct_accesses() {
@@ -65,9 +71,47 @@ fn test_matrix_within_struct_accesses() {
     return;
 }
 
+fn test_matrix_within_array_within_struct_accesses() {
+    var idx_1: i32 = 1;
+    var t_1: MatCx2InArray;
+
+    let _e7 = idx_1;
+    idx_1 = (_e7 - 1);
+    _ = nested_mat_cx2_.am;
+    _ = nested_mat_cx2_.am[0];
+    _ = nested_mat_cx2_.am[0][0];
+    let _e25 = idx_1;
+    _ = nested_mat_cx2_.am[0][_e25];
+    _ = nested_mat_cx2_.am[0][0][1];
+    let _e41 = idx_1;
+    _ = nested_mat_cx2_.am[0][0][_e41];
+    let _e47 = idx_1;
+    _ = nested_mat_cx2_.am[0][_e47][1];
+    let _e55 = idx_1;
+    let _e57 = idx_1;
+    _ = nested_mat_cx2_.am[0][_e55][_e57];
+    t_1 = MatCx2InArray(array<mat4x2<f32>,2>(mat4x2<f32>(vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0)), mat4x2<f32>(vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0))));
+    let _e63 = idx_1;
+    idx_1 = (_e63 + 1);
+    t_1.am = array<mat4x2<f32>,2>(mat4x2<f32>(vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0)), mat4x2<f32>(vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0), vec2<f32>(0.0, 0.0)));
+    t_1.am[0] = mat4x2<f32>(vec2<f32>(8.0), vec2<f32>(7.0), vec2<f32>(6.0), vec2<f32>(5.0));
+    t_1.am[0][0] = vec2<f32>(9.0);
+    let _e90 = idx_1;
+    t_1.am[0][_e90] = vec2<f32>(90.0);
+    t_1.am[0][0][1] = 10.0;
+    let _e107 = idx_1;
+    t_1.am[0][0][_e107] = 20.0;
+    let _e113 = idx_1;
+    t_1.am[0][_e113][1] = 30.0;
+    let _e121 = idx_1;
+    let _e123 = idx_1;
+    t_1.am[0][_e121][_e123] = 40.0;
+    return;
+}
+
 fn read_from_private(foo_1: ptr<function, f32>) -> f32 {
-    let _e5 = (*foo_1);
-    return _e5;
+    let _e6 = (*foo_1);
+    return _e6;
 }
 
 fn test_arr_as_arg(a: array<array<f32,10>,5>) -> f32 {
@@ -87,17 +131,18 @@ fn foo_vert(@builtin(vertex_index) vi: u32) -> @builtin(position) vec4<f32> {
     let baz_1 = foo;
     foo = 1.0;
     test_matrix_within_struct_accesses();
+    test_matrix_within_array_within_struct_accesses();
     let _matrix = bar._matrix;
     let arr = bar.arr;
     let b = bar._matrix[3][0];
     let a_1 = bar.data[(arrayLength((&bar.data)) - 2u)].value;
     let c_1 = qux;
     let data_pointer = (&bar.data[0].value);
-    let _e31 = read_from_private((&foo));
+    let _e32 = read_from_private((&foo));
     c = array<i32,5>(a_1, i32(b), 3, 4, 5);
     c[(vi + 1u)] = 42;
     let value = c[vi];
-    let _e45 = test_arr_as_arg(array<array<f32,10>,5>(array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
+    let _e46 = test_arr_as_arg(array<array<f32,10>,5>(array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), array<f32,10>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)));
     return vec4<f32>((_matrix * vec4<f32>(vec4<i32>(value))), 2.0);
 }
 
@@ -116,22 +161,22 @@ fn atomics() {
     var tmp: i32;
 
     let value_1 = atomicLoad((&bar.atom));
-    let _e9 = atomicAdd((&bar.atom), 5);
-    tmp = _e9;
-    let _e12 = atomicSub((&bar.atom), 5);
-    tmp = _e12;
-    let _e15 = atomicAnd((&bar.atom), 5);
-    tmp = _e15;
-    let _e18 = atomicOr((&bar.atom), 5);
-    tmp = _e18;
-    let _e21 = atomicXor((&bar.atom), 5);
-    tmp = _e21;
-    let _e24 = atomicMin((&bar.atom), 5);
-    tmp = _e24;
-    let _e27 = atomicMax((&bar.atom), 5);
-    tmp = _e27;
-    let _e30 = atomicExchange((&bar.atom), 5);
-    tmp = _e30;
+    let _e10 = atomicAdd((&bar.atom), 5);
+    tmp = _e10;
+    let _e13 = atomicSub((&bar.atom), 5);
+    tmp = _e13;
+    let _e16 = atomicAnd((&bar.atom), 5);
+    tmp = _e16;
+    let _e19 = atomicOr((&bar.atom), 5);
+    tmp = _e19;
+    let _e22 = atomicXor((&bar.atom), 5);
+    tmp = _e22;
+    let _e25 = atomicMin((&bar.atom), 5);
+    tmp = _e25;
+    let _e28 = atomicMax((&bar.atom), 5);
+    tmp = _e28;
+    let _e31 = atomicExchange((&bar.atom), 5);
+    tmp = _e31;
     atomicStore((&bar.atom), value_1);
     return;
 }
diff --git a/tests/out/wgsl/globals.wgsl b/tests/out/wgsl/globals.wgsl
index 77435cabce..147f6ec322 100644
--- a/tests/out/wgsl/globals.wgsl
+++ b/tests/out/wgsl/globals.wgsl
@@ -14,9 +14,13 @@ var<storage> dummy: array<vec2<f32>>;
 @group(0) @binding(3) 
 var<uniform> float_vecs: array<vec4<f32>,20>;
 @group(0) @binding(4) 
-var<uniform> global_vec: vec4<f32>;
+var<uniform> global_vec: vec3<f32>;
 @group(0) @binding(5) 
-var<uniform> global_mat: mat4x4<f32>;
+var<uniform> global_mat: mat3x2<f32>;
+@group(0) @binding(6) 
+var<uniform> global_nested_arrays_of_matrices_2x4_: array<array<mat2x4<f32>,2>,2>;
+@group(0) @binding(7) 
+var<uniform> global_nested_arrays_of_matrices_4x2_: array<array<mat4x2<f32>,2>,2>;
 
 fn test_msl_packed_vec3_as_arg(arg: vec3<f32>) {
     return;
@@ -28,8 +32,8 @@ fn test_msl_packed_vec3_() {
     alignment.v3_ = vec3<f32>(1.0);
     alignment.v3_.x = 1.0;
     alignment.v3_.x = 2.0;
-    let _e21 = idx;
-    alignment.v3_[_e21] = 3.0;
+    let _e23 = idx;
+    alignment.v3_[_e23] = 3.0;
     let data = alignment;
     _ = data.v3_;
     _ = data.v3_.zx;
@@ -46,17 +50,20 @@ fn main() {
     var at: bool = true;
 
     test_msl_packed_vec3_();
-    let _e10 = global_mat;
-    let _e11 = global_vec;
-    wg[6] = (_e10 * _e11).x;
-    let _e19 = dummy[1].y;
-    wg[5] = _e19;
-    let _e25 = float_vecs[0].w;
-    wg[4] = _e25;
-    let _e29 = alignment.v1_;
-    wg[3] = _e29;
-    let _e34 = alignment.v3_.x;
-    wg[2] = _e34;
+    let _e16 = global_nested_arrays_of_matrices_4x2_[0][0];
+    let _e23 = global_nested_arrays_of_matrices_2x4_[0][0][0];
+    wg[7] = (_e16 * _e23).x;
+    let _e28 = global_mat;
+    let _e29 = global_vec;
+    wg[6] = (_e28 * _e29).x;
+    let _e37 = dummy[1].y;
+    wg[5] = _e37;
+    let _e43 = float_vecs[0].w;
+    wg[4] = _e43;
+    let _e47 = alignment.v1_;
+    wg[3] = _e47;
+    let _e52 = alignment.v3_.x;
+    wg[2] = _e52;
     alignment.v1_ = 4.0;
     wg[1] = f32(arrayLength((&dummy)));
     atomicStore((&at_1), 2u);