diff --git a/.github/workflows/ci-sharktank-nightly.yaml b/.github/workflows/ci-sharktank-nightly.yaml
new file mode 100644
index 000000000..4bd1e7ee7
--- /dev/null
+++ b/.github/workflows/ci-sharktank-nightly.yaml
@@ -0,0 +1,92 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: Sharktank Nightly Tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekdays at 10:00 AM UTC = 02:00 AM PST / 03:00 AM PDT
+    - cron: "0 10 * * 1-5"
+
+concurrency:
+  # This workflow has no pull_request trigger, so github.event.number is empty
+  # and the group is keyed by the commit hash: a new run cancels queued and
+  # in-progress runs for the same commit. The workflow name is prepended to
+  # avoid conflicts between different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-mi300x:
+    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+    name: "Nightly tests and benchmarks"
+    strategy:
+      matrix:
+        # Quoted so a version such as "3.10" is never parsed as the float 3.1.
+        version: ["3.11"]
+      fail-fast: false
+    runs-on: llama-mi300x-3
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      HF_HOME: "/data/huggingface"
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # The benchmark comparison runs third_party/benchmark/tools/compare.py,
+          # which only exists when the submodule is checked out.
+          submodules: true
+
+      - name: Get Current Date
+        id: date
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+      - name: Create Python venv
+        run: python -m venv ${VENV_DIR}
+
+      - name: Install pip deps
+        run: |
+          source ${VENV_DIR}/bin/activate
+          python -m pip install --no-compile --upgrade pip
+
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install -r requirements-iree-unpinned.txt
+          pip install --no-compile \
+            -r sharktank/requirements-tests.txt \
+            -e sharktank/
+
+          pip freeze
+
+      - name: Run benchmarks
+        run: |
+          source ${VENV_DIR}/bin/activate
+          pytest \
+            --verbose \
+            --capture=no \
+            --iree-hip-target=gfx942 \
+            --iree-device=hip://0 \
+            --with-flux-data \
+            -m="benchmark" \
+            --html=out/sharktank_nightly/benchmark/index.html \
+            sharktank/tests
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/sharktank_nightly
+          destination_dir: ./sharktank_nightly
+          keep_files: true
diff --git a/.github/workflows/ci-sharktank.yml b/.github/workflows/ci-sharktank.yml
index 51a34db4f..309849853 100644
--- a/.github/workflows/ci-sharktank.yml
+++ b/.github/workflows/ci-sharktank.yml
@@ -85,7 +85,10 @@ jobs:
       - name: Run sharktank tests
         if: ${{ !cancelled() }}
         run: |
-          pytest -n 4 sharktank/ --durations=10
+          pytest \
+            -n 4 \
+            --durations=10 \
+            sharktank/
 
 
   test_with_data:
@@ -191,7 +194,9 @@ jobs:
 
       - name: Run punet tests
         run: |
-          pytest -v sharktank/ -m punet_quick \
+          pytest -v sharktank/ \
+            -m "punet_quick or model_punet" \
+            --with-expensive \
             --durations=0 \
             --timeout=600
 
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..3628e583a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/benchmark"]
+	path = third_party/benchmark
+	url = https://github.com/google/benchmark
diff --git a/sharktank/pyproject.toml b/sharktank/pyproject.toml
index 09aca178b..cdb72a5f0 100644
--- a/sharktank/pyproject.toml
+++ b/sharktank/pyproject.toml
@@ -41,9 +41,10 @@ testing = {file = ["requirements-tests.txt"]}
 addopts = [
     "-ra",
     "--import-mode=importlib",
-    "-m=unit",
+    "-m=unit and not expensive",
 ]
 markers = [
+    "benchmark: model benchmarks",
     "expensive: tests that are very expensive",
     "export: tests that require export from torch",
     "golden: tests that compare to some golden values",
diff --git a/sharktank/sharktank/models/flux/benchmark.py b/sharktank/sharktank/models/flux/benchmark.py
new file mode 100644
index 000000000..58af63c0c
--- /dev/null
+++ b/sharktank/sharktank/models/flux/benchmark.py
@@ -0,0 +1,92 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pathlib import Path
+import iree.compiler
+import iree.runtime
+import os
+from iree.turbine.support.tools import iree_tool_prepare_input_args
+
+from .export import (
+    export_flux_transformer_from_hugging_face,
+    flux_transformer_default_batch_sizes,
+    iree_compile_flags,
+)
+from ...types import Dataset
+from .flux import FluxModelV1, FluxParams
+from ...utils.export_artifacts import ExportArtifacts
+from ...utils.iree import flatten_for_iree_signature
+from ...utils.benchmark import iree_benchmark_module
+
+
+def iree_benchmark_flux_dev_transformer(
+    artifacts_dir: Path,
+    iree_device: str,
+    json_result_output_path: Path,
+    caching: bool = False,
+) -> str:
+    """Export the FLUX.1-dev transformer if needed, then benchmark it with IREE.
+
+    `model.mlir` and `parameters.irpa` under `artifacts_dir` are re-exported unless
+    `caching` is set and both files already exist."""
+    mlir = artifacts_dir / "model.mlir"
+    irpa = artifacts_dir / "parameters.irpa"
+    if not (caching and os.path.exists(mlir) and os.path.exists(irpa)):
+        export_flux_transformer_from_hugging_face(
+            "black-forest-labs/FLUX.1-dev/black-forest-labs-transformer",
+            mlir_output_path=mlir,
+            parameters_output_path=irpa,
+        )
+    return iree_benchmark_flux_transformer(
+        mlir_path=mlir,
+        parameters_path=irpa,
+        artifacts_dir=artifacts_dir,
+        iree_device=iree_device,
+        json_result_output_path=json_result_output_path,
+        caching=caching,
+    )
+
+
+def iree_benchmark_flux_transformer(
+    artifacts_dir: Path,
+    mlir_path: Path,
+    parameters_path: Path,
+    iree_device: str,
+    json_result_output_path: Path,
+    caching: bool = False,
+) -> str:
+    """Benchmark a Flux transformer through `iree_benchmark_module`.
+
+    Builds `--input` flags from the model's sample inputs, compiles `mlir_path` to
+    `artifacts_dir/model.vmfb` (skipped when `caching` is set and the file exists)
+    and returns the benchmark helper's output."""
+    batch_size = flux_transformer_default_batch_sizes[0]
+    parameters = Dataset.load(parameters_path)
+    params = FluxParams.from_hugging_face_properties(parameters.properties)
+    flux = FluxModelV1(theta=parameters.root_theta, params=params)
+    flat_inputs = flatten_for_iree_signature(flux.sample_inputs(batch_size=batch_size))
+    input_files = iree_tool_prepare_input_args(
+        flat_inputs, file_path_prefix=f"{artifacts_dir / 'arg'}"
+    )
+
+    vmfb_path = artifacts_dir / "model.vmfb"
+    if not (caching and os.path.exists(vmfb_path)):
+        iree.compiler.compile_file(
+            mlir_path, output_file=vmfb_path, extra_args=iree_compile_flags
+        )
+
+    benchmark_flags = [
+        f"--device={iree_device}",
+        f"--module={vmfb_path}",
+        f"--parameters=model={parameters_path}",
+        f"--function=forward_bs{batch_size}",
+        "--benchmark_repetitions=30",
+        "--benchmark_min_warmup_time=1.0",
+        "--benchmark_out_format=json",
+        f"--benchmark_out={json_result_output_path}",
+    ]
+    benchmark_flags.extend(f"--input={path}" for path in input_files)
+    return iree_benchmark_module(benchmark_flags)
diff --git a/sharktank/sharktank/models/flux/export.py b/sharktank/sharktank/models/flux/export.py
index a63e75af9..dc5d8ba93 100644
--- a/sharktank/sharktank/models/flux/export.py
+++ b/sharktank/sharktank/models/flux/export.py
@@ -17,6 +17,26 @@
 
 flux_transformer_default_batch_sizes = [1]
 
+iree_compile_flags = [
+    "--iree-hal-target-device=hip",
+    "--iree-hip-target=gfx942",
+    "--iree-opt-const-eval=false",
+    "--iree-opt-strip-assertions=true",
+    "--iree-global-opt-propagate-transposes=true",
+    "--iree-dispatch-creation-enable-fuse-horizontal-contractions=true",
+    "--iree-dispatch-creation-enable-aggressive-fusion=true",
+    "--iree-opt-aggressively-propagate-transposes=true",
+    "--iree-opt-outer-dim-concat=true",
+    "--iree-vm-target-truncate-unsupported-floats",
+    "--iree-llvmgpu-enable-prefetch=true",
+    "--iree-opt-data-tiling=false",
+    "--iree-codegen-gpu-native-math-precision=true",
+    "--iree-codegen-llvmgpu-use-vector-distribution",
+    "--iree-hip-waves-per-eu=2",
+    "--iree-execution-model=async-external",
+    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline,iree-preprocessing-pad-to-intrinsics)",
+]
+
 
 def export_flux_transformer_model_mlir(
     model: FluxModelV1,
diff --git a/sharktank/sharktank/models/flux/testing.py b/sharktank/sharktank/models/flux/testing.py
index da9561b33..9097928f8 100644
--- a/sharktank/sharktank/models/flux/testing.py
+++ b/sharktank/sharktank/models/flux/testing.py
@@ -7,6 +7,7 @@
 import torch
 from os import PathLike
 from collections import OrderedDict
+import pytest
 
 from .flux import FluxParams, FluxModelV1
 from .export import export_flux_transformer, flux_transformer_default_batch_sizes
@@ -17,6 +18,8 @@
     make_mmdit_single_block_random_theta,
 )
 
+with_flux_data = pytest.mark.skipif("not config.getoption('with_flux_data')")
+
 
 def convert_flux_transformer_input_for_hugging_face_model(
     img: torch.Tensor,
diff --git a/sharktank/sharktank/utils/benchmark.py b/sharktank/sharktank/utils/benchmark.py
new file mode 100644
index 000000000..ecc5d506c
--- /dev/null
+++ b/sharktank/sharktank/utils/benchmark.py
@@ -0,0 +1,118 @@
+# Copyright 2025 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Any
+import iree.runtime
+import subprocess
+import json
+import sys
+import pandas
+from pathlib import Path
+import os
+from os import PathLike
+
+
+def _run_program(
+    args: tuple[str],
+):
+    """Run `args` as a subprocess and return its decoded stdout.
+
+    A non-zero exit status raises `RuntimeError` carrying both captured
+    streams; on success, any stderr text is forwarded to `sys.stderr`.
+    """
+    completed = subprocess.run(args=args, capture_output=True)
+    stdout_text = completed.stdout.decode()
+    stderr_text = completed.stderr.decode()
+
+    if completed.returncode != 0:
+        raise RuntimeError(f"stderr:\n{stderr_text}\nstdout:\n{stdout_text}")
+
+    if stderr_text:
+        print(stderr_text, file=sys.stderr)
+
+    return stdout_text
+
+
+def iree_benchmark_module(cli_args: list[str]) -> str:
+    """Run `iree-benchmark-module` with `cli_args` and return its stdout.
+
+    `cli_args` is unpacked, so any sequence of strings works (not only lists)."""
+    return _run_program(args=[iree.runtime.benchmark_exe(), *cli_args])
+
+
+def google_benchmark_compare_path() -> str:
+    """Return the absolute path of `third_party/benchmark/tools/compare.py`.
+
+    The location is derived from this file: four `parent` hops up, then down
+    into the `third_party/benchmark` tree. Existence is not checked.
+    """
+    root = Path(__file__).parent.parent.parent.parent
+    return os.path.abspath(root / "third_party" / "benchmark" / "tools" / "compare.py")
+
+
+def iree_benchmark_compare(cli_args: list[str]) -> str:
+    """Run Google Benchmark's `compare.py` with this interpreter; return stdout."""
+    return _run_program([sys.executable, google_benchmark_compare_path(), *cli_args])
+
+
+def _get_benchmark_comparison_aggregate_real_time(
+    benchmark_comparison_result_json: list[dict[str, Any]], aggregate: str
+) -> tuple[float, float, str]:
+    """Return `(baseline, contender, time_unit)` real time for one aggregate.
+
+    Exactly one entry with `aggregate_name == aggregate` must be present."""
+    matches = [
+        b
+        for b in benchmark_comparison_result_json
+        if "aggregate_name" in b and b["aggregate_name"] == aggregate
+    ]
+    assert len(matches) == 1, f"want 1 '{aggregate}' entry, got {len(matches)}"
+    times = matches[0]["measurements"][0]
+    return times["real_time"], times["real_time_other"], matches[0]["time_unit"]
+
+
+def _assert_contender_aggregate_real_time_is_not_worse(
+    benchmark_comparison_result_json: list[dict[str, Any]], aggregate: str
+):
+    """Raise `AssertionError` if the contender's `aggregate` real time exceeds
+    the baseline's. Equal times pass."""
+    baseline, contender, unit = _get_benchmark_comparison_aggregate_real_time(
+        benchmark_comparison_result_json, aggregate
+    )
+    if contender > baseline:
+        raise AssertionError(
+            f"Benchmark contender {aggregate} real time {contender} {unit} "
+            f"is worse than baseline {baseline} {unit}."
+        )
+
+
+def iree_benchmark_assert_contender_is_not_worse(
+    benchmark_comparison_result_json: list[dict[str, Any]], alpha: float = 0.05
+):
+    """Assert that the contender benchmark is not slower than the baseline.
+
+    The report must contain exactly one entry with a `utest.time_pvalue`. When
+    that p-value is at least `alpha`, the timing difference is treated as not
+    significant and the check passes. Otherwise the contender's mean and median
+    real times must both be no greater than the baseline's.
+
+    Arguments
+    ---------
+    benchmark_comparison_result_json: entries of a benchmark comparison report.
+    alpha: significance level; a p-value below it means the two sample sets are
+    treated as coming from different distributions."""
+    pvalues = [
+        b["utest"]["time_pvalue"]
+        for b in benchmark_comparison_result_json
+        if "utest" in b and "time_pvalue" in b["utest"]
+    ]
+    assert len(pvalues) == 1, f"expected 1 time_pvalue, got {len(pvalues)}"
+    if alpha <= pvalues[0]:
+        return  # Difference is not statistically significant.
+    for aggregate in ("mean", "median"):
+        _assert_contender_aggregate_real_time_is_not_worse(
+            benchmark_comparison_result_json, aggregate
+        )
diff --git a/sharktank/sharktank/utils/testing.py b/sharktank/sharktank/utils/testing.py
index 0ad524f37..9e70eff11 100644
--- a/sharktank/sharktank/utils/testing.py
+++ b/sharktank/sharktank/utils/testing.py
@@ -7,6 +7,7 @@
 from typing import Optional
 import contextlib
 from pathlib import Path
+import pytest
 from os import PathLike
 import os
 import shutil
@@ -21,6 +22,8 @@
 from ..types import *
 from .math import cosine_similarity
 
+is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
+
 # Range of torch.rand() is [0,1)
 # Range of torch.rand() * 2 - 1 is [-1, 1), includes negative values
 def make_rand_torch(shape: list[int], dtype: Optional[torch.dtype] = torch.float32):
diff --git a/sharktank/tests/evaluate/perplexity_iree_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py
index f529ac0fa..1745a3fc6 100644
--- a/sharktank/tests/evaluate/perplexity_iree_test.py
+++ b/sharktank/tests/evaluate/perplexity_iree_test.py
@@ -10,8 +10,9 @@
 import numpy as np
 
 from sharktank.evaluate import perplexity_iree
+from sharktank.utils.testing import is_mi300x
+
 
-is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
 skipif_run_quick_llama_test = pytest.mark.skipif(
     'not config.getoption("run-nightly-llama-tests")',
     reason="Run large tests if --run-nightly-llama-tests is passed",
diff --git a/sharktank/tests/models/flux/flux_benchmark_test.py b/sharktank/tests/models/flux/flux_benchmark_test.py
new file mode 100644
index 000000000..90e612403
--- /dev/null
+++ b/sharktank/tests/models/flux/flux_benchmark_test.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pathlib import Path
+import json
+import pandas
+import pytest
+import sys
+import torch
+
+from sharktank.models.flux.benchmark import iree_benchmark_flux_dev_transformer
+from sharktank.models.flux.testing import with_flux_data
+from sharktank.utils.benchmark import (
+    iree_benchmark_compare,
+    iree_benchmark_assert_contender_is_not_worse,
+)
+from sharktank.utils.testing import is_mi300x
+from sharktank.utils.testing import TempDirTestBase
+
+
+@pytest.mark.usefixtures("get_iree_flags", "caching", "path_prefix")
+@pytest.mark.benchmark
+@pytest.mark.expensive
+class FluxBenchmark(TempDirTestBase):
+    """Benchmark regression check for the Flux transformer compiled with IREE."""
+
+    def setUp(self):
+        super().setUp()
+        torch.manual_seed(12345)
+        if self.path_prefix is None:
+            self.path_prefix = self._temp_dir
+
+    @is_mi300x
+    @with_flux_data
+    def testBenchmarkFluxDevTransformerMi300x(self):
+        artifacts_dir = Path(self.path_prefix)
+        result_path = artifacts_dir / "benchmark_result.json"
+        compare_path = artifacts_dir / "benchmark_compare_result.json"
+        baseline_path = (
+            Path(__file__).parent
+            / "flux_transformer_baseline_benchmark_result_mi300x.json"
+        )
+        benchmark_output = iree_benchmark_flux_dev_transformer(
+            artifacts_dir=artifacts_dir,
+            iree_device=self.iree_device,
+            json_result_output_path=result_path,
+            caching=self.caching,
+        )
+        print(benchmark_output)
+
+        compare_output = iree_benchmark_compare(
+            [
+                f"--dump_to_json={compare_path}",
+                "benchmarks",
+                str(baseline_path),
+                str(result_path),
+            ]
+        )
+        print(compare_output)
+
+        with open(compare_path, "r") as f:
+            comparison = json.load(f)
+        iree_benchmark_assert_contender_is_not_worse(comparison)
diff --git a/sharktank/tests/models/flux/flux_test.py b/sharktank/tests/models/flux/flux_test.py
index bdf5c2744..6bed7b65b 100644
--- a/sharktank/tests/models/flux/flux_test.py
+++ b/sharktank/tests/models/flux/flux_test.py
@@ -17,12 +17,14 @@
     export_flux_transformer_from_hugging_face,
     export_flux_transformer,
     import_flux_transformer_dataset_from_hugging_face,
+    iree_compile_flags,
 )
 from sharktank.models.flux.testing import (
     convert_flux_transformer_input_for_hugging_face_model,
     export_dev_random_single_layer,
     make_dev_single_layer_config,
     make_random_theta,
+    with_flux_data,
 )
 from sharktank.models.flux.flux import FluxModelV1, FluxParams
 from sharktank.utils.testing import TempDirTestBase
@@ -41,27 +43,6 @@
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
-with_flux_data = pytest.mark.skipif("not config.getoption('with_flux_data')")
-
-iree_compile_flags = [
-    "--iree-hal-target-device=hip",
-    "--iree-hip-target=gfx942",
-    "--iree-opt-const-eval=false",
-    "--iree-opt-strip-assertions=true",
-    "--iree-global-opt-propagate-transposes=true",
-    "--iree-dispatch-creation-enable-fuse-horizontal-contractions=true",
-    "--iree-dispatch-creation-enable-aggressive-fusion=true",
-    "--iree-opt-aggressively-propagate-transposes=true",
-    "--iree-opt-outer-dim-concat=true",
-    "--iree-vm-target-truncate-unsupported-floats",
-    "--iree-llvmgpu-enable-prefetch=true",
-    "--iree-opt-data-tiling=false",
-    "--iree-codegen-gpu-native-math-precision=true",
-    "--iree-codegen-llvmgpu-use-vector-distribution",
-    "--iree-hip-waves-per-eu=2",
-    "--iree-execution-model=async-external",
-    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline,iree-preprocessing-pad-to-intrinsics)",
-]
 
 
 def convert_dtype_if_dtype(
diff --git a/sharktank/tests/models/flux/flux_transformer_baseline_benchmark_result_mi300x.json b/sharktank/tests/models/flux/flux_transformer_baseline_benchmark_result_mi300x.json
new file mode 100644
index 000000000..b1d973ad4
--- /dev/null
+++ b/sharktank/tests/models/flux/flux_transformer_baseline_benchmark_result_mi300x.json
@@ -0,0 +1,558 @@
+{
+  "context": {
+    "date": "2025-01-26T15:32:22+00:00",
+    "host_name": "sharkmi300x-3",
+    "executable": "/home/bpetkant/ws/iree/build/RelWithDebInfo/runtime/bindings/python/iree/runtime/../_runtime_libs/iree-benchmark-module",
+    "num_cpus": 128,
+    "mhz_per_cpu": 3763,
+    "cpu_scaling_enabled": true,
+    "caches": [
+      {
+        "type": "Data",
+        "level": 1,
+        "size": 32768,
+        "num_sharing": 1
+      },
+      {
+        "type": "Instruction",
+        "level": 1,
+        "size": 32768,
+        "num_sharing": 1
+      },
+      {
+        "type": "Unified",
+        "level": 2,
+        "size": 1048576,
+        "num_sharing": 1
+      },
+      {
+        "type": "Unified",
+        "level": 3,
+        "size": 33554432,
+        "num_sharing": 8
+      }
+    ],
+    "load_avg": [
+      0.875977,
+      0.46582,
+      0.229004
+    ],
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 0,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.4941223305650055e+02,
+      "cpu_time": 5.6384478100000024e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8201269280750831e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 1,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5047306598862633e+02,
+      "cpu_time": 5.6478033500000004e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8166193076205142e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 2,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5021137697622180e+02,
+      "cpu_time": 5.6454373199999975e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8174833197664257e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 3,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5035454698372632e+02,
+      "cpu_time": 5.6479656399999988e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8170105170941186e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 4,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5070075998082757e+02,
+      "cpu_time": 5.6513533200000052e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8158682040584337e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 5,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5090650497004390e+02,
+      "cpu_time": 5.6527730799999972e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8151900385608553e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 6,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5177536100381985e+02,
+      "cpu_time": 5.6610141200000010e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8123317398238759e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 7,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5097636696882546e+02,
+      "cpu_time": 5.6543593099999794e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8149598784090508e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 8,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5182288005016744e+02,
+      "cpu_time": 5.6630932099999859e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8121756747547109e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 9,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5110136204166338e+02,
+      "cpu_time": 5.6523207299999888e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8145482281069010e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 10,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5142490100115538e+02,
+      "cpu_time": 5.6575547299999937e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8134835735281833e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 11,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5079155095154420e+02,
+      "cpu_time": 5.6511900799999989e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8155688813170898e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 12,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5180292006116360e+02,
+      "cpu_time": 5.6630688300000111e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8122412253439268e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 13,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5177955300314352e+02,
+      "cpu_time": 5.6627096100000074e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8123179711124651e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 14,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5185354396235198e+02,
+      "cpu_time": 5.6629863300000238e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8120749806550505e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 15,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5273327603936195e+02,
+      "cpu_time": 5.6710612400000036e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8091908762315709e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 16,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5235066899331287e+02,
+      "cpu_time": 5.6679550600000005e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8104440822395504e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 17,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5234270502114668e+02,
+      "cpu_time": 5.6673579399999949e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8104701861894139e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 18,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5279905395582318e+02,
+      "cpu_time": 5.6723945699999945e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8089755994407233e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 19,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5281000799732283e+02,
+      "cpu_time": 5.6721902399999988e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8089397542253665e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 20,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5279980297200382e+02,
+      "cpu_time": 5.6724346300000047e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8089731483689482e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 21,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5278226599330083e+02,
+      "cpu_time": 5.6732682099999818e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8090305379154819e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 22,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5371954495785758e+02,
+      "cpu_time": 5.6823183500000243e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8059683988147970e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 23,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5347738502314314e+02,
+      "cpu_time": 5.6792782300000158e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8067585542961000e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 24,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5333389097359031e+02,
+      "cpu_time": 5.6774159800000132e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8072270943688289e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 25,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5361895298119634e+02,
+      "cpu_time": 5.6813219200000026e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8062965413576890e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 26,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5385963898152113e+02,
+      "cpu_time": 5.6819793300000038e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8055115946684170e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 27,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5390325299231336e+02,
+      "cpu_time": 5.6860353600000121e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8053694297655212e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 28,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5448368302313611e+02,
+      "cpu_time": 5.6868854500000054e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8034795804050279e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "iteration",
+      "repetitions": 30,
+      "repetition_index": 29,
+      "threads": 1,
+      "iterations": 1,
+      "real_time": 5.5424572899937630e+02,
+      "cpu_time": 5.6859769099999721e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8042538673331396e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time_mean",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "aggregate",
+      "repetitions": 30,
+      "threads": 1,
+      "aggregate_name": "mean",
+      "aggregate_unit": "time",
+      "iterations": 30,
+      "real_time": 5.5215489286347292e+02,
+      "cpu_time": 5.6656650296666680e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8110963237949083e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time_median",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "aggregate",
+      "repetitions": 30,
+      "threads": 1,
+      "aggregate_name": "median",
+      "aggregate_unit": "time",
+      "iterations": 30,
+      "real_time": 5.5209812449174933e+02,
+      "cpu_time": 5.6652255749999904e+02,
+      "time_unit": "ms",
+      "items_per_second": 1.8112725834222321e+00
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time_stddev",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "aggregate",
+      "repetitions": 30,
+      "threads": 1,
+      "aggregate_name": "stddev",
+      "aggregate_unit": "time",
+      "iterations": 30,
+      "real_time": 1.3402606227623226e+00,
+      "cpu_time": 1.3650754512544929e+00,
+      "time_unit": "ms",
+      "items_per_second": 4.3969683769790810e-03
+    },
+    {
+      "name": "BM_forward_bs1/process_time/real_time_cv",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "run_name": "BM_forward_bs1/process_time/real_time",
+      "run_type": "aggregate",
+      "repetitions": 30,
+      "threads": 1,
+      "aggregate_name": "cv",
+      "aggregate_unit": "percentage",
+      "iterations": 30,
+      "real_time": 2.4273272592256435e-03,
+      "cpu_time": 2.4093825598701258e-03,
+      "time_unit": "ms",
+      "items_per_second": 2.4277937728711340e-03
+    }
+  ]
+}
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index 6056f9c6c..1c4371cf8 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -19,8 +19,8 @@
     IreeBenchmarkException,
     IreeCompileException,
 )
+from sharktank.utils.testing import is_mi300x
 
-is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
 skipif_run_quick_llama_test = pytest.mark.skipif(
     'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
     reason="Skipping largs tests when --run-quick-llama-test is set.",
diff --git a/third_party/benchmark b/third_party/benchmark
new file mode 160000
index 000000000..c58e6d071
--- /dev/null
+++ b/third_party/benchmark
@@ -0,0 +1 @@
+Subproject commit c58e6d0710581e3a08d65c349664128a8d9a2461