diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp
index 1618a340ce10..2c78572521ee 100644
--- a/csrc/cutlass_extensions/torch_utils.hpp
+++ b/csrc/cutlass_extensions/torch_utils.hpp
@@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
                         name, ".stride(", idx, ") to be ", StrideEle::value);
             return StrideEle{};
           } else {
-            return tensor.stride(idx);
+            if (tensor.size(idx) == 1) {
+              // Use a 0 stride for size-1 dims; this is easier for
+              // cute/cutlass to optimize (helps the TMA code flatten dims).
+              return StrideEle{0};
+            } else {
+              return tensor.stride(idx);
+            }
           }
         } else {
           // Extra strides are assumed to be 0 or 1
diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh
index e2604d4bed3e..60a4ed60535b 100644
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) {
   auto arguments = MacheteKernel::create_arguments(
       stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr,
       layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0),
-      args.group_size.value_or(K));
+      args.group_size);
   TORCH_CHECK(MacheteKernel::can_implement(arguments),
               "Machete kernel cannot be run with these arguments");
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 8ef3c4914027..53d0b86c4f7b 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -389,7 +389,8 @@ def machete_gemm_fake(
     @torch.library.register_fake("_C::machete_prepack_B")
     def machete_prepack_B_fake(b_q_weight: torch.Tensor,
                                b_type: ScalarType) -> torch.Tensor:
-        return torch.empty_like(b_q_weight)
+        return torch.empty_like(b_q_weight,
+                                memory_format=torch.contiguous_format)
 
     @torch.library.register_fake("_C::causal_conv1d_fwd")
     def causal_conv1d_fwd_fake(x: torch.Tensor, weight: torch.Tensor,