diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 6fe6746ec0..dc507f9162 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -356,6 +356,14 @@ vkapi::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
+utils::uvec3 vTensor::mapped_extents() const {
+  utils::uvec3 m_extents;
+  m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)];
+  m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)];
+  m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)];
+  return m_extents;
+}
+
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ =
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index 70f363796f..31052b351d 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -347,10 +347,25 @@ class vTensor final {
     return storage_.storage_type_ == utils::kBuffer;
   }
 
+  /*
+   * Returns the raw image extents of the underlying image texture used to store
+   * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents
+   * may not correspond to the width, height, or channels dimension of the
+   * tensor.
+   */
   inline const utils::uvec3& image_extents() const {
     return storage_.image_extents_;
   }
 
+  /*
+   * Returns the image extents of the underlying image texture, but re-ordered
+   * such that the first element is the extent of the axis used to represent the
+   * tensor's width dimension, the second element is the extent of the axis used
+   * to represent the tensor's height dimension, and the third element is the
+   * extent of the axis used to represent the tensor's channels dimension.
+   */
+  utils::uvec3 mapped_extents() const;
+
   /*
    * Extract an `vkapi::ScalarType` from the TensorOptions member
    */
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index afdc8290cd..4678795533 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -288,6 +288,10 @@ class ComputeGraph final {
     return values_.at(idx).toConstTensor().image_extents();
   }
 
+  inline utils::uvec3 mapped_extents_of(const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().mapped_extents();
+  }
+
   inline int32_t numel_of(const ValueRef idx) const {
     return values_.at(idx).toConstTensor().numel();
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
index 1698efb0b1..6e964c745e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
@@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED:
 $if BATCH_MODE:
   #define BATCH_MODE
 
-$if TILE_ROW == "tile_row_2":
-  #define TILE_ROW_2
+$if HAS_BIAS:
+  #define HAS_BIAS
 
 #include "indexing_utils.h"
-#include "matmul.h"
 
-// addmm will have additional arguments compared to regular mm
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self;
+${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")}
+$if HAS_BIAS:
+  ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")}
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+${layout_declare_ubo(B, "ivec4", "out_axis_mapping")}
+${layout_declare_ubo(B, "ivec4", "mat1_sizes")}
+${layout_declare_ubo(B, "ivec4", "mat1_axis_mapping")}
+${layout_declare_ubo(B, "ivec4", "mat2_sizes")}
+${layout_declare_ubo(B, "ivec4", "mat2_axis_mapping")}
+$if HAS_BIAS:
+  ${layout_declare_ubo(B, "ivec4", "bias_sizes")}
+  ${layout_declare_ubo(B, "ivec4", "bias_axis_mapping")}
+  ${layout_declare_ubo(B, "float", "alpha", "float", "beta")}
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
-};
+layout(constant_id = 3) const int out_packed_dim = C_DIM;
 
-layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes {
-  ivec4 self_sizes;
-};
+// To convince the SPIR-V compiler to unroll the loops optimally, need this
+// macro
+#define FOUR 4
 
-layout(set = 0, binding = 7) uniform PRECISION restrict InLimits {
-  ivec3 in_limits;
+#define TILE_ROWS ${TILE_ROWS}
+
+// we avoid mat4 and vec4 usage here as they compile to much less efficient
+// SPIR-V
+struct FloatMatrix_2d {
+  float data[TILE_ROWS][FOUR];
 };
 
-layout(set = 0, binding = 8) uniform PRECISION restrict Params {
-  float alpha;
-  float beta;
+struct FloatMatrix_3d {
+  float data[TILE_ROWS][FOUR][FOUR];
 };
 
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+#ifdef BATCH_MODE
+  #define FloatMatrix FloatMatrix_3d
+#else
+  #define FloatMatrix FloatMatrix_2d
+#endif // BATCH_MODE
+
+#ifdef HAS_BIAS
+// get texel from self tensor (channel_packed) in addmm
+vec4 get_texel_C_packed(const ivec2 idx) {
+  ivec3 bias_pos = ivec3(0);
+  if (bias_sizes.x > 1) {
+    bias_pos[bias_axis_mapping.x] = idx.x;
+  }
+  if (bias_sizes.y > 1) {
+    bias_pos[bias_axis_mapping.y] = idx.y;
+  }
 
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  return texelFetch(bias_tensor, bias_pos, 0);
+}
+#endif // HAS_BIAS
+
+FloatMatrix matmul_partial(const ivec4 out_idx_tl) {
+  FloatMatrix results;
+  for (int i = 0; i < TILE_ROWS; i++) {
+    for (int j = 0; j < FOUR; j++) {
+#ifdef BATCH_MODE
+      for (int k = 0; k < FOUR; k++) {
+        results.data[i][j][k] = 0.0f;
+      }
+#else
+      results.data[i][j] = 0.0f;
+#endif // BATCH_MODE
+    }
+  }
+  vec4 mat1_tensor_partial_load[TILE_ROWS];
+  vec4 mat2_tensor_partial_load[FOUR];
+
+#ifdef MAT2_IS_TRANSPOSED
+  const int mat2_k_axis = mat2_axis_mapping.x;
+  const int mat2_row_axis = mat2_axis_mapping.y;
+#else
+  const int mat2_k_axis = mat2_axis_mapping.y;
+  const int mat2_row_axis = mat2_axis_mapping.x;
+#endif // MAT2_IS_TRANSPOSED
+
+#ifdef BATCH_MODE
+  for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) {
+    if (out_idx_tl.z + batch_idx >= out_sizes.z) {
+      break;
+    }
+#endif // BATCH_MODE
+  for (int k = 0; k < mat1_sizes.x; k+=4) {
+    const int k_div4 = k >> 2;
+    // read and cache (4 x TILE_ROWS) tile of mat1
+    for (int r = 0; r < TILE_ROWS; r++) {
+      ivec3 mat1_pos = ivec3(0);
+      mat1_pos[mat1_axis_mapping.x] = k_div4;
+      mat1_pos[mat1_axis_mapping.y] = out_idx_tl.y + r;
+#ifdef BATCH_MODE
+      mat1_pos[mat1_axis_mapping.z] = out_idx_tl.z + batch_idx;
+#endif // BATCH_MODE
+
+      mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0);
+    }
 
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
+    // read and cache (4 x 4) tile of mat2
+    for (int r = 0; r < FOUR; ++r) {
+      ivec3 mat2_pos = ivec3(0);
+      mat2_pos[mat2_k_axis] = k_div4;
+      mat2_pos[mat2_row_axis] = out_idx_tl.x + r;
+#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED)
+      mat2_pos[mat2_axis_mapping.z] = out_idx_tl.z + batch_idx;
+#endif // BATCH_MODE
+
+      mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0);
+    }
+
+    // perform partial dot products and add partial result to results
+    for (int out_row = 0; out_row < TILE_ROWS; out_row++) {
+      for (int out_col = 0; out_col < FOUR; out_col++) {
+#ifdef BATCH_MODE
+        results.data[out_row][out_col][batch_idx] +=
+#else
+        results.data[out_row][out_col] +=
+#endif // BATCH_MODE
+            dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]);
+      }
+    }
   }
+#ifdef BATCH_MODE
+  }
+#endif // BATCH_MODE
+
+  return results;
+}
 
-  $if BATCH_MODE:
-    FloatMatrix_3d results = matmul_partial_3d(
-      im_mat1,
-      im_mat2,
-      pos,
-      out_sizes[2],
-      in_limits[0]);
-  $else:
-    FloatMatrix_2d results = matmul_partial_2d(
-        im_mat1,
-        im_mat2,
-        pos,
-        out_sizes[2],
-        in_limits[0]);
-
-  for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) {
-    for (int idx_r = 0; idx_r < FOUR; idx_r++) {
-      const ivec3 out_pos =
-          ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z);
-
-      vec4 self_texel = get_texel_C_packed(
-          im_self,
-          out_pos,
-          self_sizes.x == 1,
-          self_sizes.y == 1);
-
-      // results is in transposed order w.r.t. the desired output
-      $if BATCH_MODE:
-        imageStore(
-          im_out,
-          out_pos,
-          vec4(
-              beta * self_texel.x + alpha * results.data[idx_c][idx_r][0],
-              beta * self_texel.x + alpha * results.data[idx_c][idx_r][1],
-              beta * self_texel.x + alpha * results.data[idx_c][idx_r][2],
-              beta * self_texel.x + alpha * results.data[idx_c][idx_r][3]));
-      $else:
-        imageStore(
-            im_out,
-            out_pos,
-            vec4(
-                beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0));
+//
+// Write result matrix to output (3D matmul)
+//
+
+void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) {
+  ivec3 out_pos = to_texture_pos(
+      out_idx_tl, out_sizes, out_axis_mapping, out_packed_dim);
+
+  for (int tile_c = 0;
+       tile_c < TILE_ROWS;
+       tile_c++, out_pos[out_axis_mapping.y]++) {
+    out_pos[out_axis_mapping.x] = out_idx_tl.x;
+
+    for (int tile_r = 0;
+         tile_r < FOUR;
+         tile_r++, out_pos[out_axis_mapping.x]++) {
+
+#ifdef HAS_BIAS
+      ivec2 bias_idx;
+      bias_idx[bias_axis_mapping.x] = out_pos[out_axis_mapping.x];
+      bias_idx[bias_axis_mapping.y] = out_pos[out_axis_mapping.y];
+      float bias_val = get_texel_C_packed(bias_idx).x;
+#ifdef BATCH_MODE
+      vec4 bias_texel = vec4(bias_val);
+#else
+      vec4 bias_texel = vec4(bias_val, 0, 0, 0);
+#endif // BATCH_MODE
+#endif // HAS_BIAS
+
+#ifdef BATCH_MODE
+      vec4 out_texel = vec4(
+            results.data[tile_c][tile_r][0],
+            results.data[tile_c][tile_r][1],
+            results.data[tile_c][tile_r][2],
+            results.data[tile_c][tile_r][3]);
+#else
+      vec4 out_texel = vec4(
+            results.data[tile_c][tile_r],
+            0.0,
+            0.0,
+            0.0);
+#endif // BATCH_MODE
+
+#ifdef HAS_BIAS
+      imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel);
+#else
+      imageStore(out_tensor, out_pos, out_texel);
+#endif // HAS_BIAS
     }
   }
 }
+
+void main() {
+  // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of
+  // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4)
+  // tile of output elements will be computed. Note the sizes are written in
+  // (W x H x C) format.
+  const ivec3 tile_idx = ivec3(gl_GlobalInvocationID);
+
+  // Calculate the tensor index of the top left element in the output tile
+  const ivec4 out_idx_topleft = ivec4(
+      tile_idx.x * 4,
+      tile_idx.y * TILE_ROWS,
+#ifdef BATCH_MODE
+      tile_idx.z * 4,
+#else
+      tile_idx.z,
+#endif // BATCH_MODE
+      0);
+
+  // If the top left element is already out of range, then skip
+  if (any(greaterThanEqual(out_idx_topleft, out_sizes))) {
+    return;
+  }
+
+  FloatMatrix results = matmul_partial(out_idx_topleft);
+
+  write_results_C_packed(out_idx_topleft, results);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
index b958d3b954..c82c2003d2 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
@@ -7,24 +7,37 @@
 addmm_optimized:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
-    PACKING: C_packed
     MAT2_IS_TRANSPOSED: false
     BATCH_MODE: false
-    TILE_ROW: tile_row_4
+    TILE_ROWS: 4
+    HAS_BIAS: true
   generate_variant_forall:
-    TILE_ROW:
-      - VALUE: tile_row_4
-      - VALUE: tile_row_2
+    TILE_ROWS:
+      - VALUE: 4
+        SUFFIX: tile_row_4
+      - VALUE: 2
+        SUFFIX: tile_row_2
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
     - NAME: addmm_optimized
+    - NAME: matmul_optimized
+      HAS_BIAS: false
     - NAME: linear_optimized
       MAT2_IS_TRANSPOSED: true
+    - NAME: matmul_transposed_optimized
+      MAT2_IS_TRANSPOSED: true
+      HAS_BIAS: false
     - NAME: batch_addmm_optimized
       BATCH_MODE: true
+    - NAME: batch_matmul_optimized
+      BATCH_MODE: true
+      HAS_BIAS: false
     - NAME: batch_linear_optimized
       MAT2_IS_TRANSPOSED: true
       BATCH_MODE: true
+    - NAME: batch_matmul_transposed_optimized
+      MAT2_IS_TRANSPOSED: true
+      BATCH_MODE: true
+      HAS_BIAS: false
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
deleted file mode 100644
index 8634371a7b..0000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-$if MAT2_IS_TRANSPOSED:
-  #define MAT2_IS_TRANSPOSED
-
-$if BATCH_MODE:
-  #define BATCH_MODE
-
-$if TILE_ROW == "tile_row_2":
-  #define TILE_ROW_2
-
-#include "indexing_utils.h"
-#include "matmul.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-
-layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict InLimits {
-  ivec3 in_limits;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  $if BATCH_MODE:
-    FloatMatrix_3d results = matmul_partial_3d(
-        im_mat1,
-        im_mat2,
-        pos,
-        out_sizes[2],
-        in_limits[0]);
-  $else:
-    FloatMatrix_2d results = matmul_partial_2d(
-        im_mat1,
-        im_mat2,
-        pos,
-        out_sizes[2],
-        in_limits[0]);
-
-  for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) {
-    for (int idx_r = 0; idx_r < FOUR; idx_r++) {
-      const ivec3 out_pos =
-          ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z);
-
-      // results is in transposed order w.r.t. the desired output
-      $if BATCH_MODE:
-        imageStore(
-          im_out,
-          out_pos,
-          vec4(
-              results.data[idx_c][idx_r][0],
-              results.data[idx_c][idx_r][1],
-              results.data[idx_c][idx_r][2],
-              results.data[idx_c][idx_r][3]));
-      $else:
-        imageStore(
-            im_out,
-            out_pos,
-            vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0));
-    }
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
deleted file mode 100644
index 9268d5a25a..0000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-matmul_optimized:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-    PACKING: C_packed
-    MAT2_IS_TRANSPOSED: false
-    BATCH_MODE: false
-    TILE_ROW: tile_row_4
-  generate_variant_forall:
-    TILE_ROW:
-      - VALUE: tile_row_4
-      - VALUE: tile_row_2
-    DTYPE:
-      - VALUE: float
-      - VALUE: half
-  shader_variants:
-    - NAME: matmul_optimized
-    - NAME: matmul_transposed_optimized
-      MAT2_IS_TRANSPOSED: true
-    - NAME: batch_matmul_optimized
-      BATCH_MODE: true
-    - NAME: batch_matmul_transposed_optimized
-      MAT2_IS_TRANSPOSED: true
-      BATCH_MODE: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
index 63b60bf52f..14c814b084 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -174,10 +174,19 @@ void add_addmm_optimized_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   utils::uvec3 global_size;
+
+  // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the
+  // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is
+  // channels packed, C does not need to be divided by 4. The "identity" of each
+  // thread is the (x, y, z) coordinate of the output tile it is computing, and
+  // this identity can be used to compute the tensor index of the top left
+  // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0]
   if (mat1_sizes.at(mat1_dims - 2) < 8) {
-    global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1});
+    // Use `mapped_extents` instead of `image_extents` because the workgroup
+    // axes need to correspond to tensor dimensions.
+    global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1});
   } else {
-    global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1});
+    global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1});
   }
   utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
@@ -191,14 +200,18 @@ void add_addmm_optimized_node(
        {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {
-          graph.texture_limits_ubo(out),
           graph.sizes_ubo(out),
+          graph.axis_mapping_ubo(out),
+          graph.sizes_ubo(mat1_W_packed),
+          graph.axis_mapping_ubo(mat1_W_packed),
+          graph.sizes_ubo(mat2_packed),
+          graph.axis_mapping_ubo(mat2_packed),
           graph.sizes_ubo(self),
-          graph.texture_limits_ubo(mat1_W_packed),
+          graph.axis_mapping_ubo(self),
           graph.create_params_buffer(params),
       },
       // Specialization Constants
-      {},
+      {graph.packed_dim_whcn_idx_of(out)},
       // Resizing Logic
       resize_addmm_node,
       {mat2_is_transposed}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
index a25a602e38..07618239a6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -181,12 +181,21 @@ void add_matmul_optimized_node(
 
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
+  // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the
+  // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is
+  // channels packed, C does not need to be divided by 4. The "identity" of each
+  // thread is the (x, y, z) coordinate of the output tile it is computing, and
+  // this identity can be used to compute the tensor index of the top left
+  // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0]
   utils::uvec3 global_size;
   if (mat1_sizes.at(mat1_dims - 2) < 8) {
-    global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1});
+    // Use `mapped_extents` instead of `image_extents` because the workgroup
+    // axes need to correspond to tensor dimensions.
+    global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1});
   } else {
-    global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1});
+    global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1});
   }
+
   utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -199,12 +208,15 @@ void add_matmul_optimized_node(
        {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {
-          graph.texture_limits_ubo(out),
           graph.sizes_ubo(out),
-          graph.texture_limits_ubo(mat1_W_packed),
+          graph.axis_mapping_ubo(out),
+          graph.sizes_ubo(mat1_W_packed),
+          graph.axis_mapping_ubo(mat1_W_packed),
+          graph.sizes_ubo(mat2_packed),
+          graph.axis_mapping_ubo(mat2_packed),
       },
       // Specialization Constants
-      {},
+      {graph.packed_dim_whcn_idx_of(out)},
       // Resizing Logic
       resize_matmul_node,
       {mat2_is_transposed}));