
Commit 04ea5f8

Merge branch 'main' into deprecate-old-apis
2 parents fbb2f2b + c96f2dd commit 04ea5f8

32 files changed: +1086 −201 lines

.github/scripts/validate_binaries.sh

File mode changed from 100644 to 100755.

benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py

Lines changed: 52 additions & 24 deletions
@@ -7,7 +7,7 @@
 #
 # To run these benchmarks, use the following command:
 #
-# torchrun --nproc-per-node=8 --local-ranks-filter=0 benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py
+# torchrun --nproc-per-node=4 --local-ranks-filter=0 benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py
 #
 #######################################################################
 import argparse
@@ -24,6 +24,7 @@
     all_to_all_single,
     all_to_all_single_autograd,
 )
+from torch.nn import functional as F
 from tqdm import tqdm

 from benchmarks.utils import profile_fn
@@ -66,33 +67,53 @@ def get_configs() -> List[ExperimentConfig]:
     return configs


-# Copy/paste a2a impls added in https://github.com/pytorch/torchtitan/pull/1765
-def default_a2a_dispatch(
+def default_a2a_fwd_bwd(
     routed_input: torch.Tensor,
+    labels: torch.Tensor,
     output_splits_list: list[int],
     input_splits_list: list[int],
     device_mesh: DeviceMesh,
 ):
-    """
-    Default implementation of all-to-all dispatch. Incurs device-to-host sync.
-
-    Returns:
-        routed_input: the local tokens after all-to-all dispatch
-        input_splits: the input splits for all-to-all dispatch
-        output_splits: the output splits for all-to-all dispatch
-        num_tokens_per_expert_group: the number of tokens per EP rank after all-to-all dispatch
-    """
-    # perform all-to-all
     routed_input = all_to_all_single_autograd(
         routed_input,
         output_splits_list,
         input_splits_list,
         device_mesh.get_group(),
     )
     routed_input = torch.ops._c10d_functional.wait_tensor(routed_input)
+
+    loss = F.mse_loss(routed_input, labels)
+    loss.backward()
+
+    torch.cuda.synchronize()
     return routed_input


+def mxfp8_a2a_fwd_bwd(
+    routed_input: torch.Tensor,
+    labels: torch.Tensor,
+    output_splits_list: list[int],
+    input_splits_list: list[int],
+    device_mesh: DeviceMesh,
+):
+    routed_input = to_mxfp8_a2a_dequant(
+        routed_input,
+        output_splits_list,
+        input_splits_list,
+        device_mesh.get_group(),
+    )
+
+    loss = F.mse_loss(routed_input, labels)
+    loss.backward()
+    torch.cuda.synchronize()
+    return routed_input
+
+
+# Compile target funcs
+default_a2a_sync_compiled = torch.compile(default_a2a_fwd_bwd)
+mxfp8_a2a_sync_compiled = torch.compile(mxfp8_a2a_fwd_bwd)
+
+
 def run_experiment(
     config: ExperimentConfig, args: argparse.Namespace
 ) -> ExperimentResult:
@@ -101,8 +122,9 @@ def run_experiment(
         (batch_size * seq_len, dim),
         dtype=torch.bfloat16,
         device=device,
+        requires_grad=True,
     )
-    ref_x = x.detach().clone()
+    ref_x = x.detach().clone().requires_grad_(True)

     # Set up device mesh
     mesh = init_device_mesh("cuda", (dist.get_world_size(),))
@@ -121,24 +143,27 @@ def warmup(func_no_args):
     )
     input_splits_list, output_splits_list = get_split_lists(input_splits, mesh)

-    # Compile target funcs
-    default_a2a_dispatch_c = torch.compile(default_a2a_dispatch)
-    to_mxfp8_a2a_dequant_c = torch.compile(to_mxfp8_a2a_dequant)
+    # Generate labels
+    labels_shape = (sum(output_splits_list), dim)
+    labels = x.new_ones(*labels_shape)

     # Bench default a2a (exclude d2h sync from preparing input splits_list and output_splits_list)
     warmup(
-        lambda: default_a2a_dispatch_c(
-            ref_x, output_splits_list, input_splits_list, mesh
+        lambda: default_a2a_sync_compiled(
+            ref_x, labels, output_splits_list, input_splits_list, mesh
         )
     )
     start_sec = time.perf_counter()
-    default_a2a_dispatch_c(ref_x, output_splits_list, input_splits_list, mesh)
+    default_a2a_sync_compiled(
+        ref_x, labels, output_splits_list, input_splits_list, mesh
+    )
     end_sec = time.perf_counter()
     bf16_ms = (end_sec - start_sec) * 1e3
     if args.profile:
         profile_fn(
-            default_a2a_dispatch_c,
+            default_a2a_sync_compiled,
             ref_x,
+            labels,
             output_splits_list,
             input_splits_list,
             mesh,
@@ -148,16 +173,19 @@ def warmup(func_no_args):

     # Bench mxfp8 sync a2a (exclude d2h sync from preparing input splits_list and output_splits_list)
     warmup(
-        lambda: to_mxfp8_a2a_dequant_c(x, output_splits_list, input_splits_list, mesh)
+        lambda: mxfp8_a2a_sync_compiled(
+            x, labels, output_splits_list, input_splits_list, mesh
+        )
     )
     start_sec = time.perf_counter()
-    to_mxfp8_a2a_dequant_c(x, output_splits_list, input_splits_list, mesh)
+    mxfp8_a2a_sync_compiled(x, labels, output_splits_list, input_splits_list, mesh)
     end_sec = time.perf_counter()
     mxfp8_ms = (end_sec - start_sec) * 1e3
     if args.profile:
         profile_fn(
-            to_mxfp8_a2a_dequant_c,
+            mxfp8_a2a_sync_compiled,
             x,
+            labels,
             output_splits_list,
             input_splits_list,
             mesh,
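Note on the benchmark change above: each all-to-all variant is now wrapped in a function that runs the dispatch, an MSE loss against fixed labels, and a backward pass; the function is compiled once, warmed up, and then timed over a single invocation. A minimal sketch of that measurement pattern, with a toy stand-in for the dispatch (illustrative only, not part of the commit):

```python
# Minimal sketch of the warmup-then-time pattern used above (toy stand-in,
# not the actual all-to-all dispatch).
import time

import torch
import torch.nn.functional as F


def toy_fwd_bwd(x: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    out = x * 2.0  # stand-in for the all-to-all dispatch + dequant
    loss = F.mse_loss(out, labels)
    loss.backward()
    torch.cuda.synchronize()  # ensure backward kernels finish before timing stops
    return out


toy_compiled = torch.compile(toy_fwd_bwd)

x = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16, requires_grad=True)
labels = x.new_ones(x.shape)

toy_compiled(x, labels)  # warmup: triggers compilation, excluded from the measurement

start_sec = time.perf_counter()
toy_compiled(x, labels)
end_sec = time.perf_counter()
print(f"fwd+bwd: {(end_sec - start_sec) * 1e3:.3f} ms")
```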
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+# Testing compatibility
+# We know that torchao .so files built using PyTorch 2.8.0 are not ABI compatible with PyTorch 2.9+ (see #2919).
+# If the version of torch is not compatible with the version of torchao,
+# we expect loading of the .so files to be skipped: a warning should be logged but no error raised.
+
+PREV_TORCH_VERSION="2.8.0"
+PREV_TORCHAO_VERSION="0.13.0"
+
+# Function to check torchao import with configurable expectations
+check_torchao_import() {
+    local expect_warning="$1"
+    local warning_text="$2"
+    local extra_env="${3:-}"
+
+    if [ -n "$extra_env" ]; then
+        output=$(env "$extra_env" python -c "import torchao" 2>&1)
+    else
+        output=$(python -c "import torchao" 2>&1)
+    fi
+    exit_code=$?
+
+    if [ $exit_code -ne 0 ]; then
+        echo "ERROR: Failed to import torchao"
+        echo "Output: $output"
+        exit 1
+    fi
+
+    warning_found=false
+    if [ -n "$warning_text" ] && echo "$output" | grep -i "$warning_text" > /dev/null; then
+        echo "Output: $output"
+        warning_found=true
+    fi
+
+    if [ "$expect_warning" != "$warning_found" ]; then
+        echo "FAILURE: expect_warning is $expect_warning but warning_found is $warning_found with message $output"
+        exit 1
+    fi
+}
+
+## prev torch version, prev torchao version
+# Uninstall torch
+pip uninstall -y torch
+# Uninstall torchao
+pip uninstall -y torchao
+# Install prev compatible version of torch
+pip install torch==${PREV_TORCH_VERSION}
+# Install prev compatible version of torchao
+pip install torchao==${PREV_TORCHAO_VERSION}
+# Should import successfully without warning
+check_torchao_import "false" ""
+
+## current torch, current torchao
+# Uninstall torch
+pip uninstall -y torch
+# Uninstall torchao
+pip uninstall -y torchao
+# Install specific compatible version of torch (nightly 2.9.0dev)
+pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu129
+# Build torchao from source
+python setup.py develop
+# Should import successfully without warning
+check_torchao_import "false" ""
+
+## prev torch, torchao from source (do not rebuild), env var = True
+# Uninstall torch
+pip uninstall -y torch
+# Install incompatible version of torch
+pip install torch==${PREV_TORCH_VERSION}
+# Should import with warning because optional env var is set to true
+check_torchao_import "true" "Skipping import of cpp extensions due to incompatible torch version" "TORCHAO_SKIP_LOADING_SO_FILES=1"
+
+## current torch, prev torchao
+# Uninstall torch
+pip uninstall -y torch
+# Uninstall torchao
+pip uninstall -y torchao
+# Install non-ABI stable torch version
+pip install torch==2.9.0
+# Install incompatible torchao
+pip install torchao==${PREV_TORCHAO_VERSION}
+# Should import with specific warning
+check_torchao_import "true" "Skipping import of cpp extensions due to incompatible torch version"
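The script above exercises torchao's import-time guard: when the installed torch does not match the torch the .so files were built against, the C++ extensions are skipped and a warning is logged rather than raising. A rough sketch of that kind of guard, for illustration only; the constant, version comparison, and loader call are assumptions, not torchao's actual internals, while the env var name and warning text come from the script:

```python
# Illustrative sketch (not torchao's actual implementation) of skipping compiled
# extensions with a warning when the installed torch version is incompatible.
import logging
import os

import torch

logger = logging.getLogger(__name__)

# Hypothetical constant recorded when the .so files were built.
_TORCH_VERSION_AT_BUILD_TIME = "2.8.0"


def _load_cpp_extensions() -> None:
    installed = torch.__version__.split("+")[0]
    skip_requested = os.environ.get("TORCHAO_SKIP_LOADING_SO_FILES", "0") == "1"
    # Assumed rule: major.minor must match the build-time torch.
    incompatible = not installed.startswith(_TORCH_VERSION_AT_BUILD_TIME.rsplit(".", 1)[0])

    if skip_requested or incompatible:
        logger.warning(
            "Skipping import of cpp extensions due to incompatible torch version: "
            f"built against {_TORCH_VERSION_AT_BUILD_TIME}, found {installed}"
        )
        return

    # The real package would load the compiled libraries here, e.g. via
    # torch.ops.load_library(...).
```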

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 13 additions & 0 deletions
@@ -218,3 +218,16 @@ def test_narrow_similar_to_vllm(self):
             gemm_kernel_choice=MXGemmKernelChoice.EMULATED,
         )
         self._test_narrow_similar_to_vllm(config)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(
+        not torch_version_at_least("2.8.0"),
+        reason="torch.compile requires PyTorch 2.8+",
+    )
+    def test_nvfp4_quantize_3d_param_similar_to_vllm(self):
+        config = NVFP4InferenceConfig(
+            mm_config=NVFP4MMConfig.WEIGHT_ONLY,
+            use_triton_kernel=False,
+            use_dynamic_per_tensor_scale=False,
+        )
+        self._test_quantize_3d_param_similar_to_vllm(config)
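The new test reuses the shared helper to quantize a 3D parameter (stacked expert weights, as in vLLM's fused MoE layers) with a weight-only NVFP4 config. For reference, a hedged sketch of applying the same config to an ordinary module via quantize_; the import path and toy module are assumptions for illustration, and the 3D-parameter case itself goes through the _test_quantize_3d_param_similar_to_vllm helper instead:

```python
# Illustrative sketch: applying the weight-only NVFP4 config with quantize_.
# Import path and toy module are assumptions, not taken from this commit.
import torch
from torchao.prototype.mx_formats import NVFP4InferenceConfig, NVFP4MMConfig
from torchao.quantization import quantize_

model = torch.nn.Sequential(torch.nn.Linear(256, 512, bias=False)).cuda().bfloat16()

config = NVFP4InferenceConfig(
    mm_config=NVFP4MMConfig.WEIGHT_ONLY,  # quantize weights only, activations stay bf16
    use_triton_kernel=False,
    use_dynamic_per_tensor_scale=False,
)
quantize_(model, config)  # replaces the Linear weight with an NVFP4-quantized tensor

x = torch.randn(8, 256, device="cuda", dtype=torch.bfloat16)
y = model(x)  # the quantized weight is handled inside the overridden mm op
```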

test/prototype/mx_formats/test_nvfp4_tensor.py

Lines changed: 44 additions & 16 deletions
@@ -42,6 +42,7 @@
         (torch.float32, (64, 128), False),
         (torch.bfloat16, (128, 256), False),
         (torch.bfloat16, (64, 128), True),
+        (torch.bfloat16, (1, 32, 64), False),
     ],
 )
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -83,14 +84,20 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
         f"Dtype mismatch: {x.dtype} vs {x_reconstructed.dtype}"
     )

-    x_nvfp4_t = x_nvfp4.t()
+    if len(x.shape) == 2:
+        x_nvfp4_t = x_nvfp4.t()
+        x_t = x.t()
+    else:
+        x_nvfp4_t = x_nvfp4.transpose(-2, -1)
+        x_t = x.transpose(-2, -1)
+
     x_reconstructed_t = x_nvfp4_t.to_dtype(dtype)
-    assert_sqnr_gt_threshold(x.t(), x_reconstructed_t, 8.0)
+    assert_sqnr_gt_threshold(x_t, x_reconstructed_t, 8.0)

-    assert x.t().shape == x_reconstructed_t.shape, (
+    assert x_t.shape == x_reconstructed_t.shape, (
         f"Transpose shape mismatch: {x.t().shape} vs {x_reconstructed_t.shape}"
     )
-    assert x.t().dtype == x_reconstructed_t.dtype, (
+    assert x_t.dtype == x_reconstructed_t.dtype, (
         f"Transpose dtype mismatch: {x.t().dtype} vs {x_reconstructed_t.dtype}"
     )
@@ -103,6 +110,7 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
         (16, 32),
         (64, 128),
         (384, 128),
+        (1, 32, 64),
     ],
 )
 @pytest.mark.skipif(
@@ -115,8 +123,7 @@ def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
     that the _is_swizzled_scales flag is set correctly.
     """

-    M, K = shape
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(*shape, device="cuda", dtype=torch.bfloat16)

     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=is_swizzled_scales)
     assert tensor._is_swizzled_scales == is_swizzled_scales
@@ -536,36 +543,43 @@ def test_nvfp4_to_copy():
 @pytest.mark.parametrize("use_triton_kernel", [False, True])
 @pytest.mark.parametrize("is_swizzled_scales", [False, True])
 @pytest.mark.parametrize(
-    "mk",
+    "shape",
     (
         (128, 64),
         (128 + 16, 64),
         (128, 64 + 16),
         (128 + 16, 64 + 16),
+        (1, 128, 64),
     ),
 )
 def test_scale_shape_matches_qdata(
-    transpose, use_triton_kernel, is_swizzled_scales, mk
+    transpose, use_triton_kernel, is_swizzled_scales, shape
 ):
     if use_triton_kernel and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for nvfp4 triton kernel")
     if use_triton_kernel and not is_swizzled_scales:
         pytest.skip("triton kernel requires swizzled scales")

-    M, K = mk
-
     block_size = 16

-    x_hp = torch.randn(M, K, device="cuda")
+    x_hp = torch.randn(*shape, device="cuda")
     x = NVFP4Tensor.to_nvfp4(
         x_hp, is_swizzled_scales=is_swizzled_scales, use_triton_kernel=use_triton_kernel
     )

-    m_dim, k_dim = 0, 1
-    if transpose:
-        x_hp = x_hp.t()
-        x = x.t()
-        m_dim, k_dim = 1, 0
+    if len(shape) == 2:
+        m_dim, k_dim = 0, 1
+        if transpose:
+            x_hp = x_hp.t()
+            x = x.t()
+            m_dim, k_dim = 1, 0
+    else:
+        assert len(shape) == 3, "unsupported"
+        m_dim, k_dim = 1, 2
+        if transpose:
+            x_hp = x_hp.transpose(-2, -1)
+            x = x.transpose(-2, -1)
+            m_dim, k_dim = 2, 1

     orig_m = x_hp.shape[m_dim]
     expected_padded_m = orig_m
@@ -587,3 +601,17 @@ def test_scale_shape_matches_qdata(
     assert expected_padded_k == actual_padded_k, (
         f"incompatible padded shape for dim {k_dim}: {expected_padded_k}, {actual_padded_k=}, {x.shape}, {x._scale_e4m3.shape}"
     )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
+)
+@pytest.mark.parametrize("dims", ((1, 2), (2, 1), (-1, -2), (-2, -1)))
+@pytest.mark.parametrize("is_swizzled_scales", [True, False])
+def test_3d_transpose(dims, is_swizzled_scales):
+    x_hp = torch.randn(2, 128, 256, device="cuda")
+    x_nvfp4 = NVFP4Tensor.to_nvfp4(x_hp, is_swizzled_scales=is_swizzled_scales)
+    x_hp_t = x_hp.transpose(dims[0], dims[1])
+    x_nvfp4_t = x_nvfp4.transpose(dims[0], dims[1])
+    assert x_hp_t.shape == x_nvfp4_t.shape
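The scale-shape test above now handles both 2D (M, K) and 3D (B, M, K) inputs by choosing which axes play the M and K roles, with the leading batch dim excluded from scaling. A small standalone sketch of that dim-selection rule (illustrative helper, not part of the test file):

```python
# Small sketch of the dim-selection rule the updated test uses for 2D vs. 3D inputs.
from typing import Tuple

import torch


def mk_dims(shape: Tuple[int, ...], transpose: bool) -> Tuple[int, int]:
    if len(shape) == 2:
        # Plain (M, K) matrix; transposing swaps the two roles.
        return (1, 0) if transpose else (0, 1)
    assert len(shape) == 3, "only 2D and 3D inputs are handled"
    # (B, M, K): the leading batch dim never participates in scaling.
    return (2, 1) if transpose else (1, 2)


x = torch.randn(1, 128, 64)
m_dim, k_dim = mk_dims(x.shape, transpose=True)
x_t = x.transpose(-2, -1)
print(x_t.shape[m_dim], x_t.shape[k_dim])  # 128 64 -> the original M and K
```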

test/prototype/test_awq.py

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@

 from torchao.prototype.awq import AWQConfig, AWQStep
 from torchao.quantization import Int4WeightOnlyConfig, quantize_
-from torchao.utils import _is_fbgemm_genai_gpu_available, torch_version_at_least
+from torchao.utils import _is_fbgemm_gpu_genai_available, torch_version_at_least


 class ToyLinearModel(torch.nn.Module):
@@ -46,7 +46,7 @@ def forward(self, x):
 devices = ["cpu"]
 if (
     torch.cuda.is_available()
-    and _is_fbgemm_genai_gpu_available()
+    and _is_fbgemm_gpu_genai_available()
     and torch_version_at_least("2.6.0")
 ):
     devices.append("cuda")

0 commit comments
