diff --git a/.github/workflows/pkgci_test_amd_mi250.yml b/.github/workflows/pkgci_test_amd_mi250.yml index 6253a59301a5..0ec19c966ac3 100644 --- a/.github/workflows/pkgci_test_amd_mi250.yml +++ b/.github/workflows/pkgci_test_amd_mi250.yml @@ -56,9 +56,9 @@ jobs: echo "CC=clang" >> $GITHUB_ENV echo "CXX=clang++" >> $GITHUB_ENV - - name: Build tests + - name: Build in-tree tests run: ./build_tools/pkgci/build_tests_using_package.sh ${VENV_DIR}/bin - - name: Run GPU tests + - name: Run in-tree GPU tests env: CTEST_PARALLEL_LEVEL: 2 IREE_CTEST_LABEL_REGEX: ^requires-gpu|^driver=hip$ @@ -67,3 +67,36 @@ jobs: IREE_NVIDIA_SM80_TESTS_DISABLE: 1 IREE_MULTI_DEVICE_TESTS_DISABLE: 0 run: ./build_tools/cmake/ctest_all.sh ${BUILD_DIR} + + - name: Checkout test suites repository + uses: actions/checkout@v4.1.7 + with: + repository: ScottTodd/iree-test-suites + ref: dc027ded6175dbd753ac08d6d76ba7be36494730 + path: iree-test-suites + - name: Configure out-of-tree test suite CMake project + run: | + source ${VENV_DIR}/bin/activate + cmake -G Ninja -S iree-test-suites/matmul -B build/ \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DIREE_USE_LOCAL_REPO=ON \ + -DIREE_LOCAL_REPO_PATH=${GITHUB_WORKSPACE} \ + -DIREE_HOST_BIN_DIR=${VENV_DIR}/bin \ + -DIREE_HAL_DRIVER_LOCAL_SYNC=OFF \ + -DIREE_HAL_DRIVER_LOCAL_TASK=OFF \ + -DIREE_HAL_DRIVER_HIP=ON \ + -DIREE_HAL_DRIVER_VULKAN=OFF \ + -DIREE_HAL_DRIVER_METAL=OFF \ + -DIREE_HAL_DRIVER_CUDA=OFF \ + -DIREE_HIP_TEST_TARGET_CHIP=gfx90a + - name: Build out-of-tree test suite CMake project + run: cmake --build build/ --target iree-test-suites-matmul-deps + - name: CTest out-of-tree test suite CMake project + run: | + ctest \ + --test-dir build/ -R iree-test-suites \ + --timeout 900 \ + --output-on-failure \ + --no-tests=error \ + --label-regex "^requires-gpu|^driver=hip$" \ + --label-exclude "(^nodocker$|^driver=vulkan$|^driver=metal$|^driver=cuda$|^vulkan_uses_vk_khr_shader_float16_int8$|^requires-gpu-sm80$|^requires-gpu-rdna3$)" diff --git a/CMakeLists.txt b/CMakeLists.txt index c42e006168a2..547c7515ec68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -556,7 +556,6 @@ include(iree_lit_test) include(iree_llvm) include(iree_add_all_subdirs) include(iree_check_test) -include(iree_e2e_generated_runner_test) include(iree_native_test) include(iree_cc_binary_benchmark) include(iree_hal_cts_test_suite) diff --git a/build_tools/bazel/iree_e2e_generated_runner_test.bzl b/build_tools/bazel/iree_e2e_generated_runner_test.bzl deleted file mode 100644 index 9d17d50808d0..000000000000 --- a/build_tools/bazel/iree_e2e_generated_runner_test.bzl +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -"""Macros for defining tests that use the iree-e2e-${test_type}-test runner.""" - -load("//build_tools/bazel:iree_bytecode_module.bzl", "iree_bytecode_module") -load("//build_tools/bazel:native_binary.bzl", "native_test") - -def iree_e2e_runner_test( - name, - test_type, - tests_src, - tests_vmfb, - calls_src, - calls_vmfb, - target_backend, - driver, - test_runner, - compiler_flags = [], - runner_args = [], - tags = [], - target_cpu_features = None, - timeout = None, - **kwargs): - """Creates a test using a specified test runner program. - - Args: - name: Name of the target - test_type: Name of the test (e.g., matmuls, conv2ds). - tests_src: mlir source file with tests to be compiled. 
- tests_vmfb: specifies the path to use for the generated IREE module. - calls_src: mlir source file with calls to be compiled. - calls_vmfb: specifies the path to use for the generated IREE module. - target_backend: target backend to compile for. - driver: driver to run the module with. - compiler_flags: additional flags to pass to the compiler. Bytecode - output format and backend flags are passed automatically. - runner_args: additional args to pass to the test runner program. The - driver and input file flags are passed automatically. - tags: Additional labels to apply to the test. "driver=${DRIVER}" is - added automatically. - test_runner: test runner program to run. - timeout: timeout for the generated tests. - target_cpu_features: target CPU features. Only for llvm-cpu backend. - **kwargs: any additional attributes to pass to the underlying tests and - test suite. - """ - - if target_cpu_features: - fail("target_cpu_features must currently be empty") - - iree_bytecode_module( - name = name + "_%s_module" % test_type, - module_name = tests_vmfb, - src = tests_src, - flags = [ - "--iree-hal-target-backends=%s" % target_backend, - ] + ([ - "--iree-llvmcpu-target-cpu-features=%s" % target_cpu_features, - ] if target_cpu_features else []) + compiler_flags, - visibility = ["//visibility:private"], - testonly = True, - **kwargs - ) - - iree_bytecode_module( - name = name + "_calls_module", - module_name = calls_vmfb, - src = calls_src, - flags = [ - "--iree-hal-target-backends=%s" % target_backend, - ] + compiler_flags, - visibility = ["//visibility:private"], - testonly = True, - **kwargs - ) - - native_test( - name = name, - args = [ - "--device=%s" % driver, - "--module=$(location :%s)" % tests_vmfb, - "--module=$(location :%s)" % calls_vmfb, - ] + runner_args, - data = [ - ":%s" % tests_vmfb, - ":%s" % calls_vmfb, - ], - src = test_runner, - tags = tags + ["driver=%s" % driver], - timeout = timeout, - **kwargs - ) - -def iree_single_backend_e2e_runner_test( - name, - test_type, - generator, - test_runner, - target_backend, - driver, - generator_args = [], - compiler_flags = [], - runner_args = [], - tags = [], - target_cpu_features = None, - timeout = None, - **kwargs): - """Generates an iree_e2e_runner_test using a custom python generator script. - - The generator script produces .mlir sources which are compiled and passed to - iree_e2e_runner_test. - - Args: - name: Name of the target - test_type: Name of the test (e.g., matmul, conv2d). - generator: Target to run to generate the source MLIR files. - It will be invoked with the following standard flags, in addition - to generator_args: - --output_${test_type}_mlir=(current binary dir)/name_${test_type}.mlir - --output_calls_mlir=(current binary dir)/name_calls.mlir - generator_args: additional args to pass to the generator program. - target_backend: target backend to compile for. - driver: driver to run the module with. - compiler_flags: additional flags to pass to the compiler. Bytecode - output format and backend flags are passed automatically. - runner_args: additional args to pass to the test runner program. The - driver and input file flags are passed automatically. - tags: Additional labels to apply to the test. "driver=${DRIVER}" is - added automatically. - test_runner: test runner program to run. - timeout: timeout for the generated tests. - target_cpu_features: target CPU features. Only for llvm-cpu backend. - **kwargs: any additional attributes to pass to the underlying tests and - test suite. 
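For reference, a minimal sketch of a direct invocation of this macro, using the conv2d generator, runner, and flags that appear elsewhere in this change (the target name itself is illustrative):

    iree_single_backend_e2e_runner_test(
        name = "e2e_conv2d_cpu_f32_f32_f32_small",
        test_type = "conv2d",
        generator = ":generate_e2e_conv2d_tests",
        generator_args = [
            "--input_type=f32",
            "--kernel_type=f32",
            "--acc_type=f32",
            "--shapes=small",
        ],
        test_runner = "//tools/testing/e2e:iree-e2e-conv2d-test",
        target_backend = "llvm-cpu",
        driver = "local-task",
        tags = ["hostonly", "local"],
    )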
- """ - - tests_src = "%s.mlir" % (name) - tests_vmfb = "%s.vmfb" % (name) - calls_src = "%s_calls.mlir" % (name) - calls_vmfb = "%s_calls.vmfb" % (name) - native.genrule( - name = "%s_generate" % (name), - outs = [tests_src, calls_src], - cmd = " ".join([ - "$(location %s)" % (generator), - " ".join([('"%s"' % arg) for arg in generator_args]), - "--output_%s_mlir=$(location %s)" % (test_type, tests_src), - "--output_calls_mlir=$(location %s)" % (calls_src), - ] + [('"%s"' % arg) for arg in generator_args]), - tools = [generator], - message = "Generating code and calls for test %s..." % (name), - output_to_bindir = 1, - testonly = True, - **kwargs - ) - iree_e2e_runner_test( - name = name, - test_type = test_type, - tests_src = tests_src, - tests_vmfb = tests_vmfb, - calls_src = calls_src, - calls_vmfb = calls_vmfb, - target_backend = target_backend, - driver = driver, - test_runner = test_runner, - compiler_flags = compiler_flags, - runner_args = runner_args, - tags = tags, - timeout = timeout, - target_cpu_features = target_cpu_features, - **kwargs - ) - -def iree_generated_e2e_runner_test( - name, - test_type, - generator, - test_runner, - target_backends_and_drivers, - generator_args = [], - compiler_flags = [], - runner_args = [], - tags = [], - timeout = None, - target_cpu_features_variants = [], - **kwargs): - """Generates a suite of iree_e2e_runner_test on multiple backends/drivers. - - Args: - name: Name of the target - test_type: Name of the test (e.g., matmul, conv2d). - generator: Target to run to generate the source MLIR files. - It will be invoked with the following standard flags, in addition - to generator_args: - --output_${test_type}_mlir=(current binary dir)/name_${test_type}.mlir - --output_calls_mlir=(current binary dir)/name_calls.mlir - generator_args: additional args to pass to the generator program. - target_backends_and_drivers: backend/driver pairs to compile and run - the module. - compiler_flags: additional flags to pass to the compiler. Bytecode - output format and backend flags are passed automatically. - runner_args: additional args to pass to the test runner program. The - driver and input file flags are passed automatically. - tags: Additional labels to apply to the test. "driver=${DRIVER}" is - added automatically. - test_runner: test runner program to run. - timeout: timeout for the generated tests. - target_cpu_features_variants: list of target cpu features variants. - Currently unimplemented in Bazel due to difficulty of specializing - to target architecture in Bazel. The following describes the - semantics that this should have if implemented. Each - entry is either "default" for the architecture defaults, or a colon- - separated triple "arch:name:cpu_features" where "arch" filters - for a target CPU architecture (in IREE_ARCH format), "name" is a - short name for the CPU features set (used to generate target names) - and cpu_features is a comma-separated list of LLVM target attributes - to enable. Example: - x86_64:avx2_fma:+avx,+avx2,+fma - **kwargs: any additional attributes to pass to the underlying tests and test suite. - """ - - tests = [] - for backend, driver in target_backends_and_drivers: - # CUDA/ROCm backend/driver not supported by Bazel build. 
- if backend == "cuda" or driver == "cuda" or backend == "rocm" or driver == "hip": - continue - suite_entry_name = "_".join([name, backend, driver]) - iree_single_backend_e2e_runner_test( - name = suite_entry_name, - test_type = test_type, - generator = generator, - test_runner = test_runner, - driver = driver, - target_backend = backend, - generator_args = generator_args, - compiler_flags = compiler_flags, - runner_args = runner_args, - tags = tags, - timeout = timeout, - **kwargs - ) - tests.append(suite_entry_name) - native.test_suite( - name = name, - tests = tests, - tags = tags, - **kwargs - ) diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py index 9a1796b443fd..8a73090d5b69 100644 --- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py +++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py @@ -858,71 +858,6 @@ def iree_check_test_suite( f")\n\n" ) - def iree_generated_e2e_runner_test( - self, - name, - test_type, - generator, - generator_args=None, - test_runner=None, - target_backends_and_drivers=None, - compiler_flags=None, - runner_args=None, - tags=None, - target_cpu_features_variants=None, - **kwargs, - ): - if self._should_skip_target(tags=tags, **kwargs): - return - target_backends = None - drivers = None - if target_backends_and_drivers is not None: - target_backends = [it[0] for it in target_backends_and_drivers] - drivers = [it[1] for it in target_backends_and_drivers] - - name_block = self._convert_string_arg_block("NAME", name, quote=False) - test_type_block = self._convert_string_arg_block( - "TEST_TYPE", test_type, quote=False - ) - # For now we assume that the generator target is a py_binary with a single - # source .py file named like it. - generator_py = f"{generator.split(':')[-1]}.py" - generator_block = self._convert_string_arg_block( - "GENERATOR", generator_py, quote=True - ) - generator_args_block = self._convert_string_list_block( - "GENERATOR_ARGS", generator_args - ) - test_runner_block = self._convert_target_block("TEST_RUNNER", test_runner) - target_backends_block = self._convert_string_list_block( - "TARGET_BACKENDS", target_backends - ) - drivers_block = self._convert_string_list_block("DRIVERS", drivers) - compiler_flags_block = self._convert_string_list_block( - "COMPILER_FLAGS", compiler_flags - ) - runner_args_block = self._convert_string_list_block("RUNNER_ARGS", runner_args) - labels_block = self._convert_string_list_block("LABELS", tags) - target_cpu_features_variants_block = self._convert_string_list_block( - "TARGET_CPU_FEATURES_VARIANTS", target_cpu_features_variants - ) - - self._converter.body += ( - f"iree_generated_e2e_runner_test(\n" - f"{name_block}" - f"{test_type_block}" - f"{generator_block}" - f"{generator_args_block}" - f"{test_runner_block}" - f"{target_backends_block}" - f"{drivers_block}" - f"{compiler_flags_block}" - f"{runner_args_block}" - f"{labels_block}" - f"{target_cpu_features_variants_block}" - f")\n\n" - ) - def native_test(self, name, src, args=None, data=None, tags=None, timeout=None): if self._should_skip_target(tags=tags): return diff --git a/build_tools/cmake/iree_e2e_generated_runner_test.cmake b/build_tools/cmake/iree_e2e_generated_runner_test.cmake deleted file mode 100644 index 585d9906f112..000000000000 --- a/build_tools/cmake/iree_e2e_generated_runner_test.cmake +++ /dev/null @@ -1,502 +0,0 @@ -# Copyright 2021 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -include(CMakeParseArguments) - -# iree_e2e_runner_test() -# -# Creates a test using a specified test runner program for the specified -# test files. -# -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv2d are supported). -# VARIANT_NAME: Variant name to suffix NAME with. -# Will reuse the same TEST_TYPE/calls vmfb files. -# TESTS_SRC: mlir source file with TEST_TYPE to be compiled to an IREE module. -# TESTS_VMFB: specifies the path to use for the generated IREE module. -# CALLS_SRC: mlir source file with calls to be compiled to an IREE module. -# CALLS_VMFB: specifies the path to use for the generated IREE module. -# TARGET_BACKEND: target backend to compile for. -# DRIVER: driver to run the module with. -# COMPILER_FLAGS: additional flags to pass to the compiler. Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES: If specified, a string passed as argument to -# --iree-llvmcpu-target-cpu-features. -# TEST_DEFINED: Whether to define a test target. -# TEST_DISABLED: The test target will be skipped and its status will be -# 'Not Run'. -function(iree_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - # See comment in iree_check_test about this condition. - if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;VARIANT_NAME;TESTS_SRC;TESTS_VMFB;CALLS_SRC;CALLS_VMFB;TRACE;TARGET_BACKEND;DRIVER;TEST_RUNNER;TEST_DEFINED;TEST_DISABLED" - "COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES" - ${ARGN} - ) - - iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - return() - endif() - - iree_package_name(_PACKAGE_NAME) - set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") - - set(_BASE_COMPILER_FLAGS - "--iree-hal-target-backends=${_RULE_TARGET_BACKEND}" - ) - if (_RULE_TARGET_CPU_FEATURES) - list(APPEND _BASE_COMPILER_FLAGS "--iree-llvmcpu-target-cpu-features=${_RULE_TARGET_CPU_FEATURES}") - endif() - - if(NOT TARGET "${_NAME}_${_RULE_TEST_TYPE}_module") - iree_bytecode_module( - NAME - "${_RULE_NAME}_${_RULE_TEST_TYPE}_module" - MODULE_FILE_NAME - "${_RULE_TESTS_VMFB}" - SRC - "${_RULE_TESTS_SRC}" - FLAGS - "${_BASE_COMPILER_FLAGS}" - "${_RULE_COMPILER_FLAGS}" - ) - endif() - - if(NOT TARGET "${_NAME}_calls_module") - iree_bytecode_module( - NAME - "${_RULE_NAME}_calls_module" - MODULE_FILE_NAME - "${_RULE_CALLS_VMFB}" - SRC - "${_RULE_CALLS_SRC}" - FLAGS - "${_BASE_COMPILER_FLAGS}" - "${_RULE_COMPILER_FLAGS}" - ) - endif() - - # A target specifically for the test. We could combine this with the above, - # but we want that one to get pulled into iree_bytecode_module. 
- add_custom_target("${_NAME}${_RULE_VARIANT_NAME}" ALL) - add_dependencies( - "${_NAME}${_RULE_VARIANT_NAME}" - "${_NAME}_${_RULE_TEST_TYPE}_module" - "${_NAME}_calls_module" - "${_RULE_TEST_RUNNER}" - ) - - add_dependencies(iree-test-deps "${_NAME}${_RULE_VARIANT_NAME}") - - if(_RULE_TEST_DEFINED) - iree_native_test( - NAME - "${_RULE_NAME}${_RULE_VARIANT_NAME}" - DRIVER - "${_RULE_DRIVER}" - SRC - "${_RULE_TEST_RUNNER}" - DATA - ${_TESTS_VMFB} - ${_CALLS_VMFB} - ARGS - "--module={{${_TESTS_VMFB}}}" - "--module={{${_CALLS_VMFB}}}" - ${_RULE_RUNNER_ARGS} - LABELS - ${_RULE_LABELS} - DISABLED - ${_RULE_TEST_DISABLED} - ) - endif() -endfunction() - -# iree_single_backend_e2e_runner_test() -# -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv are supported). -# GENERATOR: Program (at the moment, must be Python3) to run to generate the -# source file (and possibly a trace file and module path). It will be -# invoked with the following standard flags, in addition to GENERATOR_ARGS: -# --output_${TEST_TYPE}_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_${TEST_TYPE}.mlir -# --output_calls_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_calls.mlir -# and if TARGET_CPU_FEATURES is not empty: -# --requirements=${TARGET_CPU_FEATURES} -# GENERATOR_ARGS: additional args to pass to the generator program. -# TARGET_BACKEND: target backend to compile for. -# DRIVER: driver to run the module with. -# COMPILER_FLAGS: additional flags to pass to the compiler. Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES: If specified, a string passed as argument to -# --iree-llvmcpu-target-cpu-features. -function(iree_single_backend_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - # Copied from iree_check_test. Refer to the comment there. - if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;GENERATOR;TARGET_BACKEND;DRIVER;TEST_RUNNER" - "GENERATOR_ARGS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES" - ${ARGN} - ) - - # --------------------------------------------------------------------------- - # Bytecode module builds require - # 1. the compiler, either in the same build or provided in IREE_HOST_BIN_DIR - # 2. compiler support for _RULE_INPUT_TYPE - # 3. compiler support for _RULE_TARGET_BACKEND - set(_BYTECODE_MODULE_BUILD_ENABLED TRUE) - - # 1. Check for the compiler. - if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - - # 2. Check target backend availability. - # Note: we can only reliably check for this when building the compiler host - # tools from source. If the tools are already built, we assume that all target - # backends are enabled. We could query the tools in the binary directory for - # support dynamically if optionality would be useful. 
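For example, a TARGET_BACKEND of "vulkan-spirv" is normalized below to the option name IREE_TARGET_BACKEND_VULKAN_SPIRV (uppercased, with dashes replaced by underscores) before it is checked.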
- if(NOT IREE_HOST_BIN_DIR) - string(TOUPPER ${_RULE_TARGET_BACKEND} _UPPERCASE_TARGET_BACKEND) - string(REPLACE "-" "_" _NORMALIZED_TARGET_BACKEND ${_UPPERCASE_TARGET_BACKEND}) - # TODO(scotttodd): allow plugins to provide external backends here - if(NOT DEFINED IREE_TARGET_BACKEND_${_NORMALIZED_TARGET_BACKEND}) - message(SEND_ERROR "Unknown backend '${_RULE_TARGET_BACKEND}'. Check IREE_TARGET_BACKEND_* options.") - endif() - if(NOT IREE_TARGET_BACKEND_${_NORMALIZED_TARGET_BACKEND}) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - endif() - # --------------------------------------------------------------------------- - - # --------------------------------------------------------------------------- - # Tests are defined if _RULE_DRIVER is defined. - set(_TEST_DEFINED TRUE) - if(NOT DEFINED _RULE_DRIVER) - set(_TEST_DEFINED FALSE) - endif() - - # Test execution requires - # 1. the bytecode module build to be enabled - # 2. _RULE_DRIVER is defined and runtime support is enabled - # 3. no other label exclusions (e.g. 'optonly' test with 'debug' config) - set(_TEST_DISABLED FALSE) - - # 1. Check bytecode module build. - if(NOT _BYTECODE_MODULE_BUILD_ENABLED) - set(_TEST_DISABLED TRUE) - endif() - - # 2. Check driver availability. - if(DEFINED _RULE_DRIVER) - string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER) - string(REPLACE "-" "_" _NORMALIZED_DRIVER ${_UPPERCASE_DRIVER}) - if((NOT IREE_HAL_DRIVER_${_NORMALIZED_DRIVER}) AND - (NOT IREE_EXTERNAL_${_NORMALIZED_DRIVER}_HAL_DRIVER_FOUND)) - set(_TEST_DISABLED TRUE) - endif() - endif() - - # 3. Check label exclusions. - iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - set(_TEST_DISABLED TRUE) - endif() - - if((_TEST_DISABLED OR NOT _TEST_DEFINED) AND NOT IREE_BUILD_ALL_CHECK_TEST_MODULES) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - # --------------------------------------------------------------------------- - - iree_package_name(_PACKAGE_NAME) - set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") - - set(_TESTS_SRC "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_${_RULE_TEST_TYPE}.mlir") - set(_CALLS_SRC "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_calls.mlir") - set(_TESTS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_${_RULE_TEST_TYPE}.vmfb") - set(_CALLS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_calls.vmfb") - - list(APPEND _GENERATOR_STANDARD_FLAGS "--output_${_RULE_TEST_TYPE}_mlir=${_TESTS_SRC}") - list(APPEND _GENERATOR_STANDARD_FLAGS "--output_calls_mlir=${_CALLS_SRC}") - if(_RULE_TARGET_CPU_FEATURES) - list(APPEND _GENERATOR_STANDARD_FLAGS "--requirements=${_RULE_TARGET_CPU_FEATURES}") - endif() - - if(NOT _BYTECODE_MODULE_BUILD_ENABLED) - return() - endif() - - add_custom_command( - COMMAND - "${Python3_EXECUTABLE}" - "${CMAKE_CURRENT_SOURCE_DIR}/${_RULE_GENERATOR}" - ${_GENERATOR_STANDARD_FLAGS} - ${_RULE_GENERATOR_ARGS} - OUTPUT - ${_TESTS_SRC} - ${_CALLS_SRC} - DEPENDS - ${_RULE_GENERATOR} - ) - - add_custom_target( - "${_NAME}_generated_files" - DEPENDS - ${_TESTS_SRC} - ${_CALLS_SRC} - ) - - # When using the llvm-cpu backend, the runtime build config may need to - # match the compiled executable config using (`--iree-llvmcpu-sanitize=`): - # - # | Runtime type | Compatible with these executable types | - # | -------------------- | -------------------------------------- | - # | Base (no sanitizers) | Base, ASan | - # | ASan | Base, ASan | - # | TSan | TSan (ABI break) | - - # Define the regular test suite, unless the config is llvm-cpu + TSan. 
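Per the table above, a TSan runtime can only run TSan-built executables, so when IREE_ENABLE_TSAN is set the unsanitized llvm-cpu suite is skipped here and only the _tsan variant defined further below is created.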
- if(NOT _RULE_TARGET_BACKEND STREQUAL "llvm-cpu" OR NOT IREE_ENABLE_TSAN) - iree_e2e_runner_test( - NAME ${_RULE_NAME} - TEST_TYPE ${_RULE_TEST_TYPE} - VARIANT_NAME "" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_RULE_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}" "${_NAME}_generated_files") - endif() - - # Define tests for AddressSanitizer (ASan) and ThreadSanitizer (TSan). - # Normally test suites should do this sort of branching at the leaves rather - # than modify the base CMake function directly, but sanitizers are applied - # at the build system uniformly, so until we decouple the test suites from - # source builds further this felt like a reasonable compromise. - if(_RULE_TARGET_BACKEND STREQUAL "llvm-cpu") - if(IREE_ENABLE_ASAN) - set(_ASAN_COMPILER_FLAGS ${_RULE_COMPILER_FLAGS}) - list(APPEND _ASAN_COMPILER_FLAGS "--iree-llvmcpu-link-embedded=false") - list(APPEND _ASAN_COMPILER_FLAGS "--iree-llvmcpu-sanitize=address") - iree_e2e_runner_test( - NAME ${_RULE_NAME} - TEST_TYPE ${_RULE_TEST_TYPE} - VARIANT_NAME "_asan" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_ASAN_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}_asan" "${_NAME}_generated_files") - endif() - - if(IREE_ENABLE_TSAN) - set(_TSAN_COMPILER_FLAGS ${_RULE_COMPILER_FLAGS}) - list(APPEND _TSAN_COMPILER_FLAGS "--iree-llvmcpu-link-embedded=false") - list(APPEND _TSAN_COMPILER_FLAGS "--iree-llvmcpu-sanitize=thread") - iree_e2e_runner_test( - NAME ${_RULE_NAME} - VARIANT_NAME "_tsan" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_TSAN_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}_tsan" "${_NAME}_generated_files") - endif() - endif() -endfunction() - - -# iree_generated_e2e_runner_test() -# -# Creates a set of iree_single_backend_e2e_runner_test's differing -# by target backend and driver. -# -# Mirrors the bzl rule of the same name. 
-# -# One test is generated per source and backend/driver pair. -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv are supported). -# GENERATOR: Program (at the moment, must be Python3) to run to generate the -# source file (and possibly a trace file and module path). It will be -# invoked with the following standard flags, in addition to GENERATOR_ARGS: -# --output_${TEST_TYPE}_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_${TEST_TYPE}.mlir -# --output_calls_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_calls.mlir -# GENERATOR_ARGS: additional args to pass to the generator program. -# TARGET_BACKENDS: backends to compile the module for. These form pairs with -# the DRIVERS argument (due to cmake limitations they are separate list -# arguments). The lengths must exactly match. If no backends or drivers are -# specified, a test will be generated for every supported pair. -# DRIVERS: drivers to run the module with. These form pairs with the -# TARGET_BACKENDS argument (due to cmake limitations they are separate list -# arguments). The lengths must exactly match. If no backends or drivers are -# specified, a test will be generated for every supported pair. -# COMPILER_FLAGS: additional flags to pass to the compiler. Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES_VARIANTS:list of target cpu features variants. Each -# entry is either "default" for the architecture defaults, or a colon- -# separated triple "arch:name:cpu_features" where "arch" filters -# for a target CPU architecture (in IREE_ARCH format), "name" is a -# short name for the CPU features set (used to generate target names) -# and cpu_features is a comma-separated list of LLVM target attributes -# to enable. 
Example: -# x86_64:avx2_fma:+avx,+avx2,+fma -function(iree_generated_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;GENERATOR;TEST_RUNNER" - "TARGET_BACKENDS;DRIVERS;GENERATOR_ARGS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES_VARIANTS" - ${ARGN} - ) - - iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - return() - endif() - - if(_RULE_TARGET_CPU_FEATURES_VARIANTS) - set(_TARGET_CPU_FEATURES_VARIANTS "${_RULE_TARGET_CPU_FEATURES_VARIANTS}") - else() - set(_TARGET_CPU_FEATURES_VARIANTS "default") - endif() - - - if(NOT DEFINED _RULE_TARGET_BACKENDS AND NOT DEFINED _RULE_DRIVERS) - set(_RULE_TARGET_BACKENDS "vmvx" "vulkan-spirv" "llvm-cpu") - set(_RULE_DRIVERS "local-task" "vulkan" "local-task") - endif() - - list(LENGTH _RULE_TARGET_BACKENDS _TARGET_BACKEND_COUNT) - list(LENGTH _RULE_DRIVERS _DRIVER_COUNT) - - if(NOT _TARGET_BACKEND_COUNT EQUAL _DRIVER_COUNT) - message(SEND_ERROR - "TARGET_BACKENDS count ${_TARGET_BACKEND_COUNT} does not match DRIVERS count ${_DRIVER_COUNT}") - endif() - - math(EXPR _MAX_INDEX "${_TARGET_BACKEND_COUNT} - 1") - foreach(_INDEX RANGE "${_MAX_INDEX}") - list(GET _RULE_TARGET_BACKENDS ${_INDEX} _TARGET_BACKEND) - list(GET _RULE_DRIVERS ${_INDEX} _DRIVER) - foreach(_VARIANT_STRING IN LISTS _TARGET_CPU_FEATURES_VARIANTS) - parse_target_cpu_features_variant("${_VARIANT_STRING}" - _ENABLED _TARGET_CPU_FEATURES_NAME _TARGET_CPU_FEATURES) - if(NOT _ENABLED) - # The current entry is disabled on the target CPU architecture. - continue() - endif() - set(_TARGET_CPU_FEATURES_SUFFIX "") - set(_LABELS "${_RULE_LABELS}") - if (_TARGET_CPU_FEATURES_NAME) - set(_TARGET_CPU_FEATURES_SUFFIX "_${_TARGET_CPU_FEATURES_NAME}") - list(APPEND _LABELS "cpu_features=${_TARGET_CPU_FEATURES_NAME}") - endif() - iree_single_backend_e2e_runner_test( - NAME - "${_RULE_NAME}_${_TARGET_BACKEND}_${_DRIVER}${_TARGET_CPU_FEATURES_SUFFIX}" - TEST_TYPE - ${_RULE_TEST_TYPE} - GENERATOR - ${_RULE_GENERATOR} - GENERATOR_ARGS - ${_RULE_GENERATOR_ARGS} - TEST_RUNNER - ${_RULE_TEST_RUNNER} - TARGET_BACKEND - ${_TARGET_BACKEND} - DRIVER - ${_DRIVER} - COMPILER_FLAGS - ${_RULE_COMPILER_FLAGS} - RUNNER_ARGS - ${_RULE_RUNNER_ARGS} - LABELS - ${_LABELS} - TARGET_CPU_FEATURES - ${_TARGET_CPU_FEATURES} - ) - endforeach() - endforeach() -endfunction() diff --git a/tests/e2e/attention/BUILD.bazel b/tests/e2e/attention/BUILD.bazel deleted file mode 100644 index 3e9e41d5b9ee..000000000000 --- a/tests/e2e/attention/BUILD.bazel +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# End-to-end attention tests. - -load("//build_tools/bazel:iree_e2e_generated_runner_test.bzl", "iree_generated_e2e_runner_test") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -py_binary( - name = "generate_e2e_attention_tests", - srcs = ["generate_e2e_attention_tests.py"], -) - -########################################################################### -## -## LLVMCPU backend -## -########################################################################### - -# Default CPU backend. 
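The list comprehension below expanded to one iree_generated_e2e_runner_test suite per (dtype, size) pair, e.g. e2e_attention_cpu_f16_f16_f16_small; the autogenerated CMakeLists.txt deleted next in this diff shows the CMake-converted form of each suite.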
-[iree_generated_e2e_runner_test( - name = "e2e_attention_cpu_%s_%s_%s_%s" % (dtype, dtype, dtype, size), - generator = ":generate_e2e_attention_tests", - generator_args = [ - "--query_type=%s" % dtype, - "--key_type=%s" % dtype, - "--value_type=%s" % dtype, - "--shapes=%s" % size, - ], - tags = [ - "hostonly", - "local", - ], - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"], - test_runner = "//tools/testing/e2e:iree-e2e-attention-test", - test_type = "attention", -) for dtype in [ - "f16", -] for size in [ - "small", - "medium", - "large", -]] diff --git a/tests/e2e/attention/CMakeLists.txt b/tests/e2e/attention/CMakeLists.txt deleted file mode 100644 index f7937845756d..000000000000 --- a/tests/e2e/attention/CMakeLists.txt +++ /dev/null @@ -1,88 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/e2e/attention/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -iree_generated_e2e_runner_test( - NAME - e2e_attention_cpu_f16_f16_f16_small - TEST_TYPE - attention - GENERATOR - "generate_e2e_attention_tests.py" - GENERATOR_ARGS - "--query_type=f16" - "--key_type=f16" - "--value_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-attention-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_attention_cpu_f16_f16_f16_medium - TEST_TYPE - attention - GENERATOR - "generate_e2e_attention_tests.py" - GENERATOR_ARGS - "--query_type=f16" - "--key_type=f16" - "--value_type=f16" - "--shapes=medium" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-attention-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_attention_cpu_f16_f16_f16_large - TEST_TYPE - attention - GENERATOR - "generate_e2e_attention_tests.py" - GENERATOR_ARGS - "--query_type=f16" - "--key_type=f16" - "--value_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-attention-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/tests/e2e/attention/generate_e2e_attention_tests.py b/tests/e2e/attention/generate_e2e_attention_tests.py deleted file mode 100644 index f567a16c5557..000000000000 --- a/tests/e2e/attention/generate_e2e_attention_tests.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Generator for e2e attention tests. -""" - -import argparse -import enum -import dataclasses -import typing -import math - - -# Data type of kernel entries. The string values must match MLIR data types. -@enum.unique -class QueryElemTypeId(enum.Enum): - NONE = "" - F16 = "f16" - - -# Data type of input entries. 
The string values must match MLIR data types. -@enum.unique -class KeyElemTypeId(enum.Enum): - NONE = "" - F16 = "f16" - - -# Data type of input entries. The string values must match MLIR data types. -@enum.unique -class ValueElemTypeId(enum.Enum): - NONE = "" - F16 = "f16" - - -# Data type of input entries. The string values must match MLIR data types. -@enum.unique -class ResultElemTypeId(enum.Enum): - NONE = "" - F16 = "f16" - - -# Enumerates of the collections of shapes that we can generate tests for. -# The values are the accepted values for the --shapes= flag. -@enum.unique -class ShapesId(enum.Enum): - SMALL = "small" - MEDIUM = "medium" - LARGE = "large" - - -# batch: Batch dimension -# m: M dimension of first and second matmul -# n: N dimension of second matmul -# k1: K dimension of first matmul -# k2: K dimension of second matmul -@dataclasses.dataclass -class TestShapeAndScale: - batch: int - m: int - k1: int - k2: int - n: int - scale: float - - -# Returns the list of TestShape's to use for the collection of shapes -# identified by shapes_id. -def get_test_shapes(shapes_id: ShapesId): - if shapes_id == ShapesId.SMALL: - return [ - TestShapeAndScale(batch=2, m=512, k1=64, k2=128, n=32, scale=1.0), - ] - if shapes_id == ShapesId.MEDIUM: - return [ - TestShapeAndScale(batch=2, m=1024, k1=128, k2=256, n=64, scale=1.0), - ] - if shapes_id == ShapesId.LARGE: - return [ - TestShapeAndScale(batch=2, m=2048, k1=256, k2=512, n=128, scale=1.0), - ] - - raise ValueError(shapes_id) - - -# Determines the shape of input and kernel tensors. -@dataclasses.dataclass -class TestInputTensorShapes: - batch: int - m: int - k1: int - k2: int - n: int - scale: float - - -# Helper for generate_function. Generates TestInputTensorShapes, i.e. -# converts from the runtime shape dimensions in TestShape and given dynamicity to -# the set of shapes to be used in a test function's input tensors. -def generate_shapes_and_scale(shape: TestShapeAndScale): - batch = shape.batch - m = shape.m - k1 = shape.k1 - k2 = shape.k2 - n = shape.n - scale = shape.scale - - shapes_scale = TestInputTensorShapes( - batch=batch, - m=m, - k1=k1, - k2=k2, - n=n, - scale=scale, - ) - return shapes_scale - - -# Helper to return input, kernel and output shapes based on the layout and the Attention Params. -def get_tensor_shapes( - shapes_scale: TestShapeAndScale, -): - batch = shapes_scale.batch - m = shapes_scale.m - k1 = shapes_scale.k1 - k2 = shapes_scale.k2 - n = shapes_scale.n - scale = shapes_scale.scale - - query_tensor_shape = [batch, m, k1] - key_tensor_shape = [batch, k2, k1] - value_tensor_shape = [batch, k2, n] - result_tensor_shape = [batch, m, n] - - return query_tensor_shape, key_tensor_shape, value_tensor_shape, result_tensor_shape - - -# Helper for generate_function. -# Generates a name for a test function in the generated MLIR code. -def generate_function_name( - query_type: QueryElemTypeId, - key_type: KeyElemTypeId, - value_type: ValueElemTypeId, - shapes_scale: TestInputTensorShapes, -): - query_t = query_type.value - key_t = key_type.value - value_t = value_type.value - result_t = value_type.value - - batch = shapes_scale.batch - m = shapes_scale.m - k1 = shapes_scale.k1 - k2 = shapes_scale.k2 - n = shapes_scale.n - - attention = "attention" - return ( - f"{attention}_{batch}_{m}_{k1}_{k2}_{n}" - + f"_dtype_{query_t}_{key_t}_{value_t}_{result_t}" - ) - - -# Represents a generated test function. 
-@dataclasses.dataclass -class MLIRFunction: - name: str - signature: str - import_declaration: str - definition: str - - -# Generates a test function in the generated MLIR code. -# The generated function will take the same arguments as iree_linalg_ext.attention variants -# and will just call iree_linalg_ext.attention variants with them, returning its result. -def generate_function( - query_type: QueryElemTypeId, - key_type: KeyElemTypeId, - value_type: ValueElemTypeId, - shape_scale: TestShapeAndScale, -): - shapes_scale = generate_shapes_and_scale(shape_scale) - func_name = generate_function_name( - query_type, - key_type, - value_type, - shapes_scale, - ) - - query_shape, key_shape, value_shape, result_shape = get_tensor_shapes(shapes_scale) - query_tensor_type = ( - f"tensor<{query_shape[0]}x{query_shape[1]}x{query_shape[2]}x{query_type.value}>" - ) - key_tensor_type = ( - f"tensor<{key_shape[0]}x{key_shape[1]}x{key_shape[2]}x{key_type.value}>" - ) - value_tensor_type = ( - f"tensor<{value_shape[0]}x{value_shape[1]}x{value_shape[2]}x{value_type.value}>" - ) - result_tensor_type = f"tensor<{result_shape[0]}x{result_shape[1]}x{result_shape[2]}x{value_type.value}>" - F32 = "f32" - F16 = "f16" - op_name = "iree_linalg_ext.attention" - - # Compilation info is optional; prints empty string by default. - func_definition = "" - - signature = f"({query_tensor_type}, {key_tensor_type}, {value_tensor_type}, {result_tensor_type}) -> {result_tensor_type}" - import_declaration = f"func.func private @module.{func_name}(%query: !hal.buffer_view, %key: !hal.buffer_view, %value: !hal.buffer_view, %scale: {F32}) -> !hal.buffer_view" - func_definition = func_definition + ( - f"func.func @{func_name}(%query: {query_tensor_type}, %key: {key_tensor_type}, %value: {value_tensor_type}, %scale: {F32}) -> {result_tensor_type} {{\n" - f" %result0 = tensor.empty(): {result_tensor_type}\n" - f" %scale_f16 = arith.truncf %scale : {F32} to {F16} \n" - f" %result1 = {op_name} {{\n" - f" indexing_maps = [affine_map<(batch, m, n, k1, k2) -> (batch, m, k1)>,\n" - f" affine_map<(batch, m, n, k1, k2) -> (batch, k2, k1)>,\n" - f" affine_map<(batch, m, n, k1, k2) -> (batch, k2, n)>,\n" - f" affine_map<(batch, m, n, k1, k2) -> (batch, m, n)>]\n}}" - f" ins(%query, %key, %value, %scale_f16: {query_tensor_type}, {key_tensor_type}, {value_tensor_type}, {F16})\n" - f" outs(%result0: {result_tensor_type}) -> {result_tensor_type}\n" - f" return %result1: {result_tensor_type}\n" - f"}}\n" - ) - return MLIRFunction( - name=func_name, - signature=signature, - import_declaration=import_declaration, - definition=func_definition, - ) - - -# Represents a call to a generated test function. -@dataclasses.dataclass -class TestCall: - function: MLIRFunction - op: str - - -# Enumerates ways to initialize tensor buffer contents. -@enum.unique -class TensorGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - -# Intentionally fixed seed! We want full reproducibility here, both across runs -# and across machines. -# Intentionally not shared with local_pseudorandom_state to limit the ways -# in which shuffling testcases changes which random values are generated. 
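Each generated tensor argument consumes one increment of the counter below, so the pseudorandom inputs stay reproducible as long as test cases are emitted in the same order.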
-pseudorandom_generator_seed = 1 - - -def contents_generator_tag(generator: TensorGenerator): - if generator == TensorGenerator.ZERO: - return "" - elif generator == TensorGenerator.RANDOM: - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return f"!tag:iree:fully_specified_pseudorandom {pseudorandom_generator_seed}" - else: - raise ValueError(generator) - - -# Generate a 3d tensor function argument of the given size as `%name`. -def generate_random_3d_tensor( - name: str, - tensor_shape: list, - element_type: typing.Union[QueryElemTypeId, ResultElemTypeId], -): - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return ( - f" %{name}_dim0 = arith.constant {tensor_shape[0]} : i64\n" - f" %{name}_dim1 = arith.constant {tensor_shape[1]} : i64\n" - f" %{name}_dim2 = arith.constant {tensor_shape[2]} : i64\n" - f" %{name}_element_type = hal.element_type<{element_type.value}> : i32\n" - f" %{name}_seed = arith.constant {pseudorandom_generator_seed} : i32\n" - f" %{name} = call @attention_test.generate_random_tensor(%device, %{name}_dim0, %{name}_dim1, %{name}_dim2, %{name}_element_type, %{name}_seed) : (!hal.device, i64, i64, i64, i32, i32) -> !hal.buffer_view\n" - ) - - -call_id = 0 - - -def generate_call( - function: MLIRFunction, - query_type: QueryElemTypeId, - key_type: KeyElemTypeId, - value_type: ValueElemTypeId, - shapes_scale: TestShapeAndScale, -): - global call_id - func_name = f"{function.name}_{shapes_scale.batch}_{shapes_scale.m}_{shapes_scale.k1}_{shapes_scale.k2}_{shapes_scale.n}_{shapes_scale.k1}_{shapes_scale.scale}" - func_name = f"{func_name}_{call_id}" - call_id = call_id + 1 - - description = f"Attention shape (BATCHxMxK1xK2xN): {shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.k1}x{shapes_scale.k2}x{shapes_scale.k1}x{shapes_scale.n}" - op = ( - f"func.func @{func_name}() attributes {{\n" - f' iree.reflection = {{description = "{description}"}}\n' - "} {\n" - " %device_index = arith.constant 0 : index\n" - " %device = hal.devices.get %device_index : !hal.device\n" - ) - - query_shape, key_shape, value_shape, result_shape = get_tensor_shapes( - shapes_scale, - ) - - op = op + generate_random_3d_tensor("query", query_shape, query_type) - op = op + generate_random_3d_tensor("key", key_shape, key_type) - op = op + generate_random_3d_tensor("value", value_shape, value_type) - - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed - 1 - op = op + ( - f" %scale = arith.constant {shapes_scale.scale} : f32\n" - f" %result = call @module.{function.name}(%query, %key, %value, %scale) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view, f32) -> !hal.buffer_view\n" - ) - - op = op + ( - f" %batch = arith.constant {shapes_scale.batch} : i64 \n" - f" %m = arith.constant {shapes_scale.m} : i64 \n" - f" %k1 = arith.constant {shapes_scale.k1} : i64 \n" - f" %k2 = arith.constant {shapes_scale.k2} : i64 \n" - f" %n = arith.constant {shapes_scale.n} : i64 \n" - f" %queryTensor = hal.tensor.import %query : !hal.buffer_view -> tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.k1}xf16> \n" - f" %keyTensor = hal.tensor.import %key : !hal.buffer_view -> tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.k1}xf16> \n" - f" %valueTensor = hal.tensor.import %value : !hal.buffer_view -> tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.n}xf16> \n" - f" %resultTensor = hal.tensor.import %result : !hal.buffer_view -> 
tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.n}xf16> \n" - f" %queryExt = arith.extf %queryTensor : tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.k1}xf16> to tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.k1}xf32> \n" - f" %keyExt = arith.extf %keyTensor : tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.k1}xf16> to tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.k1}xf32> \n" - f" %valueExt = arith.extf %valueTensor : tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.n}xf16> to tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.n}xf32> \n" - f" %resultExt = arith.extf %resultTensor : tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.n}xf16> to tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.n}xf32> \n" - f" %queryExtBufferView = hal.tensor.export %queryExt : tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.k1}xf32> -> !hal.buffer_view \n" - f" %keyExtBufferView = hal.tensor.export %keyExt : tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.k1}xf32> -> !hal.buffer_view \n" - f" %valueExtBufferView = hal.tensor.export %valueExt : tensor<{shapes_scale.batch}x{shapes_scale.k2}x{shapes_scale.n}xf32> -> !hal.buffer_view \n" - f" %resultExtBufferView = hal.tensor.export %resultExt : tensor<{shapes_scale.batch}x{shapes_scale.m}x{shapes_scale.n}xf32> -> !hal.buffer_view \n" - f" call @attention_test.check_attention_results(%device, %batch, %m, %k1, %k2, %n, %queryExtBufferView, %keyExtBufferView, %valueExtBufferView, %resultExtBufferView) : (!hal.device, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n" - ) - - op = op + " return\n" - op = op + "}\n" - - return TestCall(function=function, op=op) - - -# Generates all output files' contents as strings. -def generate( - query_type: QueryElemTypeId, - key_type: KeyElemTypeId, - value_type: ValueElemTypeId, - shapes_id: ShapesId, -): - functions = {} - calls = [] - - for shape in get_test_shapes(shapes_id): - function = generate_function( - query_type, - key_type, - value_type, - shape, - ) - if function.name not in functions: - functions[function.name] = function - calls.append( - generate_call( - function, - query_type, - key_type, - value_type, - shape, - ) - ) - - return (functions, calls) - - -def parse_arguments(): - parser = argparse.ArgumentParser(description="Generator of e2e Attention tests") - parser.add_argument( - "--output_attention_mlir", - type=str, - help="Path of output .mlir file containing the generated Attention functions", - required=True, - ) - parser.add_argument( - "--output_calls_mlir", - type=str, - help="Path of output .mlir file containing the calls", - required=True, - ) - parser.add_argument( - "--query_type", - type=str, - choices=["f16"], - help="Numeric type of query tensors ", - required=True, - ) - parser.add_argument( - "--key_type", - type=str, - choices=["f16"], - help="Numeric type of key tensors ", - required=True, - ) - parser.add_argument( - "--value_type", - type=str, - choices=["f16"], - help="Numeric type of value tensors ", - required=True, - ) - parser.add_argument( - "--shapes_scale", - type=str, - choices=[s.value for s in ShapesId], - help="Collection of tensor shapes to test", - required=True, - ) - parser.add_argument( - "--requirements", - type=str, - help="Target requirements for this module. Comma-separated. As in -iree-llvmcpu-target-cpu-features. 
If the target device does not meet all of the requirements, the test will be skipped.", - required=False, - ) - return parser.parse_args() - - -def write_code_file(functions, filename): - with open(filename, "w") as file: - for function in functions.values(): - file.write(function.definition + "\n") - - -def write_calls_file(functions, calls, filename, requirements): - # Module-level reflection information used to control the test tool. - reflection = "" - if requirements: - reflection = ( - "iree.reflection = {" - 'target_features = "' - + ",".join([req.lstrip("+") for req in requirements.split(",")]) - + '"' - "}" - ) - module_definition = ( - f"builtin.module @calls attributes {{\n" f" {reflection}\n" f"}} {{\n\n" - ) - - # Declare the custom module that generates arguments. - module_definition = module_definition + ( - "func.func private @attention_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n" - "func.func private @attention_test.check_attention_results(%device: !hal.device, %batch: i64, %m: i64, %k1: i64, %k2: i64, %n: i64, %query: !hal.buffer_view, %key: !hal.buffer_view, %value: !hal.buffer_view, %result: !hal.buffer_view)\n" - "\n" - ) - - # Declare the functions that will be called. - for function in functions.values(): - module_definition = module_definition + function.import_declaration + "\n" - module_definition = module_definition + "\n" - - # Emit the test cases for each call. - for call in calls: - module_definition = module_definition + call.op + "\n" - - module_definition = module_definition + "\n}\n" - - with open(filename, "w") as file: - file.write(module_definition) - - -def main(args): - query_type = QueryElemTypeId(args.query_type) - key_type = KeyElemTypeId(args.key_type) - value_type = ValueElemTypeId(args.value_type) - shapes_id = ShapesId(args.shapes_scale) - - (functions, calls) = generate( - query_type, - key_type, - value_type, - shapes_id, - ) - - write_code_file(functions, args.output_attention_mlir) - write_calls_file( - functions, - calls, - args.output_calls_mlir, - args.requirements, - ) - - -if __name__ == "__main__": - main(parse_arguments()) diff --git a/tests/e2e/convolution/BUILD.bazel b/tests/e2e/convolution/BUILD.bazel deleted file mode 100644 index 9847e210158f..000000000000 --- a/tests/e2e/convolution/BUILD.bazel +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# End-to-end convolution 2d tests. - -load("//build_tools/bazel:iree_e2e_generated_runner_test.bzl", "iree_generated_e2e_runner_test") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -py_binary( - name = "generate_e2e_conv2d_tests", - srcs = ["generate_e2e_conv2d_tests.py"], -) - -########################################################################### -## -## LLVMCPU backend -## -########################################################################### - -# Default CPU backend. 
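Two suite families follow: the default e2e_conv2d_cpu_* suites below, and a parallel e2e_winograd_conv2d_cpu_* set that adds a preprocessing flag routing convolutions through iree-linalg-ext-convert-conv2d-to-winograd (replace-all-convs=true), exercising the Winograd lowering on the same generated test cases.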
-[iree_generated_e2e_runner_test( - name = "e2e_conv2d_cpu_%s_%s_%s_%s" % (dtype, dtype, dtype, size), - generator = ":generate_e2e_conv2d_tests", - generator_args = [ - "--input_type=%s" % dtype, - "--kernel_type=%s" % dtype, - "--acc_type=%s" % dtype, - "--shapes=%s" % size, - ], - tags = [ - "hostonly", - "local", - ], - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"], - test_runner = "//tools/testing/e2e:iree-e2e-conv2d-test", - test_type = "conv2d", -) for dtype in [ - "f32", - "f16", -] for size in [ - "small", - "medium", - "large", -]] - -# Default CPU backend + winograd. -[iree_generated_e2e_runner_test( - name = "e2e_winograd_conv2d_cpu_%s_%s_%s_%s" % (dtype, dtype, dtype, size), - compiler_flags = [ - "--iree-preprocessing-pass-pipeline=builtin.module\\(func.func\\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\\)\\)", - ], - generator = ":generate_e2e_conv2d_tests", - generator_args = [ - "--input_type=%s" % dtype, - "--kernel_type=%s" % dtype, - "--acc_type=%s" % dtype, - "--shapes=%s" % size, - ], - tags = [ - "hostonly", - "local", - ], - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"], - test_runner = "//tools/testing/e2e:iree-e2e-conv2d-test", - test_type = "conv2d", -) for dtype in [ - "f32", - "f16", -] for size in [ - "small", - "medium", - "large", -]] diff --git a/tests/e2e/convolution/CMakeLists.txt b/tests/e2e/convolution/CMakeLists.txt deleted file mode 100644 index 8ddad849b082..000000000000 --- a/tests/e2e/convolution/CMakeLists.txt +++ /dev/null @@ -1,325 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/e2e/convolution/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f32_f32_f32_small - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f32_f32_f32_medium - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" - "--shapes=medium" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f32_f32_f32_large - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f16_f16_f16_small - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f16_f16_f16_medium - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=medium" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_conv2d_cpu_f16_f16_f16_large - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f32_f32_f32_small - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f32_f32_f32_medium - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" 
- "--shapes=medium" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f32_f32_f32_large - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f32" - "--kernel_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f16_f16_f16_small - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f16_f16_f16_medium - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=medium" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -iree_generated_e2e_runner_test( - NAME - e2e_winograd_conv2d_cpu_f16_f16_f16_large - TEST_TYPE - conv2d - GENERATOR - "generate_e2e_conv2d_tests.py" - GENERATOR_ARGS - "--input_type=f16" - "--kernel_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-conv2d-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" - LABELS - "hostonly" - "local" - TARGET_CPU_FEATURES_VARIANTS - "default" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/tests/e2e/convolution/generate_e2e_conv2d_tests.py b/tests/e2e/convolution/generate_e2e_conv2d_tests.py deleted file mode 100644 index 0982e1801679..000000000000 --- a/tests/e2e/convolution/generate_e2e_conv2d_tests.py +++ /dev/null @@ -1,707 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Generator for e2e conv2d tests. -""" - -import argparse -import enum -import dataclasses -import typing -import math - - -# Data type of kernel entries. The string values must match MLIR data types. 
-@enum.unique -class KernelElemTypeId(enum.Enum): - NONE = "" - F32 = "f32" - F16 = "f16" - - -# Data type of input entries. The string values must match MLIR data types. -@enum.unique -class InputElemTypeId(enum.Enum): - NONE = "" - F32 = "f32" - F16 = "f16" - - -# Enumerates the collections of shapes that we can generate tests for. -# The values are the accepted values for the --shapes= flag. -@enum.unique -class ShapesId(enum.Enum): - SMALL = "small" - MEDIUM = "medium" - LARGE = "large" - - -# Enumerates ways to construct MLIR tensor types. -# TODO: Enable dynamic dimensions once the tests start passing. -@enum.unique -class Dynamicity(enum.Enum): - DYNAMIC = "dynamic" # Use '?' everywhere. Example: tensor<?x?xf32>. - STATIC = "static" # Use fixed values everywhere. Example: tensor<4x6xf32>. - MIXED = "mixed" # Randomly mix '?' and values. Example: tensor<?x4xf32>. - - -# Enumerates ways to initialize input buffer contents. -@enum.unique -class InputGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - -# Enumerates ways to initialize kernel buffer contents. -@enum.unique -class KernelGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - -# TODO: Add more input layouts as needed. The layout determines the dims of the input and kernel tensors. -@enum.unique -class InputLayout(enum.Enum): - NCHW = "nchw" - NHWC = "nhwc" - - -# TODO: Add more kernel layouts as needed. -@enum.unique -class KernelLayout(enum.Enum): - FCHW = "fchw" - HWCF = "hwcf" - - -# Describes the shape of a conv2d testcase in the usual convention: -# the input is {n}x{c}x{h}x{w}, the kernel is {f}x{c}x{kh}x{kw}, the accumulator/result is -# {n}x{f}x{oh}x{ow}. -# The extra `accumulate` boolean tells whether the conv2d is accumulating into -# an existing accumulator (C += A * B) or just overwriting the result -# (C = A * B). -@dataclasses.dataclass -class TestShape: - n: int - c: int - h: int - w: int - kh: int - kw: int - f: int - accumulate: bool - - -# Attributes for the linalg.conv2d operation. -@dataclasses.dataclass -class ConvAttrs: - STRIDE: typing.Tuple[int, int] = (1, 1) - DILATION: typing.Tuple[int, int] = (1, 1) - - -# Returns the list of TestShape's to use for the collection of shapes -# identified by shapes_id. -def get_test_shapes(shapes_id: ShapesId): - # Notes: - # 1. Be conservative in adding more shapes, as that can increase both the - # build and execution latency of tests. The build latency is nearly the - # same for all shapes, while execution latency grows linearly with - # n*f*ow*oh*kh*kw. - - if shapes_id == ShapesId.SMALL: - return [ - TestShape(n=1, c=1, h=1, w=1, kh=1, kw=1, f=1, accumulate=True), - TestShape(n=1, c=1, h=16, w=16, kh=2, kw=2, f=1, accumulate=True), - TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), - ] - if shapes_id == ShapesId.MEDIUM: - return [ - TestShape(n=2, h=32, w=32, c=32, kh=3, kw=3, f=64, accumulate=True), - ] - if shapes_id == ShapesId.LARGE: - return [ - TestShape(n=2, c=4, h=128, w=128, kh=3, kw=3, f=8, accumulate=True), - TestShape(n=2, c=3, h=128, w=128, kh=3, kw=3, f=12, accumulate=True), - ] - - raise ValueError(shapes_id) - - -# Returns the list of Dynamicity's to use for the collection of shapes -# identified by shapes_id. -def get_dynamicities(shapes_id: ShapesId): - if shapes_id == ShapesId.LARGE: - return [ - Dynamicity.STATIC, - ] - # TODO: Enable dynamic dimensions once the tests start passing.
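-    # (Until then, both branches intentionally return the same static-only list.)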
- else: - return [ - Dynamicity.STATIC, - ] - raise ValueError(shapes_id) - - -# Intentionally fixed seed! We want full reproducibility here, both across runs -# and across machines. -# Intentionally not shared with pseudorandom_generator_seed to limit the ways -# in which shuffling testcases changes which random values are generated. -local_pseudorandom_state = 1 - - -# A shape dimension value, i.e. a size value that could appear in a MLIR type -# such as 'tensor<?x4xf32>'. None means a dynamic size, similar to '?' in MLIR. -@dataclasses.dataclass -class DimSize: - value: typing.Optional[int] - - -# Generates a compile-time MLIR size value, i.e. either a fixed positive integer -# or None (which maps to MLIR '?') depending on dynamicity. -def shape_dim(x: int, dynamicity: Dynamicity): - if dynamicity == Dynamicity.DYNAMIC: - return DimSize(None) - elif dynamicity == Dynamicity.STATIC: - return DimSize(x) - else: - raise ValueError(dynamicity) - - -# Stringification used for generating MLIR types, e.g. tensor<?x4xf32>. -def int_or_question_mark(s: DimSize): - return s.value or "?" - - -# Stringification used for generating alphanumeric identifiers, e.g. -# func.func @somefunction_DYNxDYNxf32, where we can't use "?" characters. -def int_or_DYN(s: DimSize): - return s.value or "DYN" - - -# Determines the shape of input and kernel tensors. -@dataclasses.dataclass -class TestInputTensorShapes: - n: DimSize - c: DimSize - h: DimSize - w: DimSize - kh: DimSize - kw: DimSize - f: DimSize - - -# Helper for generate_function. Generates TestInputTensorShapes, i.e. -# converts the runtime shape dimensions in TestShape and the given dynamicity into -# the set of shapes to be used in a test function's input tensors. -def generate_shapes(shape: TestShape, dynamicity: Dynamicity): - n = shape_dim(shape.n, dynamicity) - c = shape_dim(shape.c, dynamicity) - h = shape_dim(shape.h, dynamicity) - w = shape_dim(shape.w, dynamicity) - kh = shape_dim(shape.kh, dynamicity) - kw = shape_dim(shape.kw, dynamicity) - f = shape_dim(shape.f, dynamicity) - shapes = TestInputTensorShapes( - n=n, - c=c, - h=h, - w=w, - kh=kh, - kw=kw, - f=f, - ) - return shapes - - -# Helper to calculate the output shape based on the input shape, kernel shape, -# dilation and stride. -def calc_out_shape(i_shape: int, k_shape: int, dilation_val: int, stride_val: int): - x = (k_shape - 1) * (dilation_val - 1) - x = i_shape - k_shape - x - return math.floor(x / stride_val) + 1 - - -# Helper to return input, kernel and output shapes based on the layouts and ConvAttrs. -def get_tensor_shape( - shapes: TestShape, - kernel_layout: KernelLayout, - input_layout: InputLayout, - conv_attr: ConvAttrs, -): - n = shapes.n - c = shapes.c - h = shapes.h - w = shapes.w - kh = shapes.kh - kw = shapes.kw - f = shapes.f - - # Extract input dimensions - input_height, input_width = h, w - - # Extract kernel dimensions - kernel_height, kernel_width = kh, kw - - # Get the dilation and stride - dilation = conv_attr.DILATION - stride = conv_attr.STRIDE - - # Calculate output height. - oh = calc_out_shape(input_height, kernel_height, dilation[0], stride[0]) - # Calculate output width.
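-    # For example, the small 16x16 input with a 2x2 kernel and unit stride/dilation gives oh = ow = floor((16 - 2 - 0) / 1) + 1 = 15.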
ow = calc_out_shape(input_width, kernel_width, dilation[1], stride[1]) - - input_tensor_shape, kernel_tensor_shape, output_tensor_shape = [], [], [] - - if input_layout == InputLayout.NCHW: - input_tensor_shape = [n, c, h, w] - output_tensor_shape = [n, f, oh, ow] - elif input_layout == InputLayout.NHWC: - input_tensor_shape = [n, h, w, c] - output_tensor_shape = [n, oh, ow, f] - else: - raise ValueError(input_layout) - - if kernel_layout == KernelLayout.FCHW: - kernel_tensor_shape = [f, c, kh, kw] - elif kernel_layout == KernelLayout.HWCF: - kernel_tensor_shape = [kh, kw, c, f] - else: - raise ValueError(kernel_layout) - - return input_tensor_shape, kernel_tensor_shape, output_tensor_shape - - -# Helper for generate_function. -# Generates a name for a test function in the generated MLIR code. -def generate_function_name( - input_type: InputElemTypeId, - kernel_type: KernelElemTypeId, - output_type: InputElemTypeId, - shapes: TestInputTensorShapes, - accumulate: bool, -): - input_t = input_type.value - kernel_t = kernel_type.value - acc_t = output_type.value - n = int_or_DYN(shapes.n) - c = int_or_DYN(shapes.c) - h = int_or_DYN(shapes.h) - w = int_or_DYN(shapes.w) - kh = int_or_DYN(shapes.kh) - kw = int_or_DYN(shapes.kw) - f = int_or_DYN(shapes.f) - - conv2d_kind = "conv2d_accumulate" if accumulate else "conv2d" - return ( - f"{conv2d_kind}_{n}_{c}_{h}_{w}_times_" - + f"{kh}_{kw}_{f}_dtype_{input_t}_{kernel_t}_{acc_t}" - ) - - -# Represents a generated test function. -@dataclasses.dataclass -class MLIRFunction: - name: str - signature: str - import_declaration: str - definition: str - - -# Generates a test function in the generated MLIR code. -# The generated function takes the same arguments as the linalg.conv2d variants -# and just calls the matching variant with them, returning its result. -def generate_function( - input_type: InputElemTypeId, - input_layout: InputLayout, - kernel_type: KernelElemTypeId, - kernel_layout: KernelLayout, - acc_type: InputElemTypeId, - conv2d_attr: ConvAttrs, - shape: TestShape, - dynamicity: Dynamicity, -): - shapes = generate_shapes(shape, dynamicity) - func_name = generate_function_name( - input_type, - kernel_type, - acc_type, - shapes, - shape.accumulate, - ) - - input_shape, kernel_shape, output_shape = get_tensor_shape( - shape, kernel_layout, input_layout, conv2d_attr - ) - input_tensor_type = f"tensor<{input_shape[0]}x{input_shape[1]}x{input_shape[2]}x{input_shape[3]}x{input_type.value}>" - kernel_tensor_type = f"tensor<{kernel_shape[0]}x{kernel_shape[1]}x{kernel_shape[2]}x{kernel_shape[3]}x{kernel_type.value}>" - - acc_tensor_type = f"tensor<{output_shape[0]}x{output_shape[1]}x{output_shape[2]}x{output_shape[3]}x{acc_type.value}>" - - op_name = None - if input_layout == InputLayout.NCHW: - if kernel_layout == KernelLayout.FCHW: - op_name = "linalg.conv_2d_nchw_fchw" - if kernel_layout == KernelLayout.HWCF: - op_name = "linalg.conv_2d_nchw_hwcf" - elif input_layout == InputLayout.NHWC: - if kernel_layout == KernelLayout.HWCF: - op_name = "linalg.conv_2d_nhwc_hwcf" - - conv_attr = f"{{dilations = dense<{list(conv2d_attr.DILATION)}> : tensor<2xi64>, strides = dense<{list(conv2d_attr.STRIDE)}> : tensor<2xi64>}}" - - # Compilation info is optional; prints empty string by default.
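-    # For the small static NCHW/FCHW f32 testcase the emitted definition looks roughly like (a sketch, not verbatim output): - #   func.func @conv2d_...(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> - # with a single linalg.conv_2d_nchw_fchw op between the signature and the return.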
- func_definition = "" - - signature = f"({input_tensor_type}, {kernel_tensor_type}, {acc_tensor_type}) -> {acc_tensor_type}" - import_declaration = f"func.func private @module.{func_name}(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view" - func_definition = func_definition + ( - f"func.func @{func_name}(%lhs: {input_tensor_type}, %rhs: {kernel_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n" - f" %result = {op_name} {conv_attr} ins(%lhs, %rhs: {input_tensor_type}, {kernel_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n" - f" return %result: {acc_tensor_type}\n" - f"}}\n" - ) - - return MLIRFunction( - name=func_name, - signature=signature, - import_declaration=import_declaration, - definition=func_definition, - ) - - -# Represents a call to a generated test function. -@dataclasses.dataclass -class TestCall: - function: MLIRFunction - op: str - - -# Enumerates ways to initialize tensor buffer contents. -@enum.unique -class TensorGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - -# Intentionally fixed seed! We want full reproducibility here, both across runs -# and across machines. -# Intentionally not shared with local_pseudorandom_state to limit the ways -# in which shuffling testcases changes which random values are generated. -pseudorandom_generator_seed = 1 - - -def contents_generator_tag(generator: TensorGenerator): - if generator == TensorGenerator.ZERO: - return "" - elif generator == TensorGenerator.RANDOM: - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return f"!tag:iree:fully_specified_pseudorandom {pseudorandom_generator_seed}" - else: - raise ValueError(generator) - - -# Generate a 4d tensor function argument of the given size as `%name`. 
-def generate_random_4d_tensor( - name: str, - tensor_shape: list, - element_type: typing.Union[InputElemTypeId, KernelElemTypeId], -): - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return ( - f" %{name}_dim0 = arith.constant {tensor_shape[0]} : i64\n" - f" %{name}_dim1 = arith.constant {tensor_shape[1]} : i64\n" - f" %{name}_dim2 = arith.constant {tensor_shape[2]} : i64\n" - f" %{name}_dim3 = arith.constant {tensor_shape[3]} : i64\n" - f" %{name}_element_type = hal.element_type<{element_type.value}> : i32\n" - f" %{name}_seed = arith.constant {pseudorandom_generator_seed} : i32\n" - f" %{name} = call @conv2d_test.generate_random_tensor(%device, %{name}_dim0, %{name}_dim1, %{name}_dim2, %{name}_dim3, %{name}_element_type, %{name}_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view\n" - ) - - -call_id = 0 - - -def generate_call( - function: MLIRFunction, - input_type: InputElemTypeId, - input_layout: InputLayout, - kernel_type: KernelElemTypeId, - kernel_layout: KernelLayout, - conv2d_attr: ConvAttrs, - acc_type: InputElemTypeId, - shape: TestShape, -): - global call_id - func_name = f"{function.name}_{shape.n}_{shape.c}_{shape.h}_{shape.w}_{shape.f}_{shape.kh}_{shape.kw}" - if shape.accumulate: - func_name = f"{func_name}_acc" - func_name = f"{func_name}_{call_id}" - call_id = call_id + 1 - - description = f"Conv2d shape (NxCxHxWxFxKHxKW): {shape.n}x{shape.c}x{shape.h}x{shape.w}x{shape.f}x{shape.kh}x{shape.kw}" - op = ( - f"func.func @{func_name}() attributes {{\n" - f' iree.reflection = {{description = "{description}"}}\n' - "} {\n" - " %device_index = arith.constant 0 : index\n" - " %device = hal.devices.get %device_index : !hal.device\n" - ) - - inp_shape, kernel_shape, out_shape = get_tensor_shape( - shape, - kernel_layout, - input_layout, - conv2d_attr, - ) - - op = op + generate_random_4d_tensor("input", inp_shape, input_type) - op = op + generate_random_4d_tensor("kernel", kernel_shape, kernel_type) - if shape.accumulate: - op = op + generate_random_4d_tensor("acc", out_shape, acc_type) - # TODO(#16168): there's a bug with in-place input->output aliasing and - # we work around it here by passing in a unique copy. 
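-        # Decrementing the seed makes the next generate_random_4d_tensor call (which pre-increments it) reuse the seed %acc was generated with, so %acc_copy holds identical contents.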
- global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed - 1 - op = op + generate_random_4d_tensor("acc_copy", out_shape, acc_type) - op = op + ( - f" %result = call @module.{function.name}(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n" - ) - else: - op = op + ( - f" %acc = util.null : !hal.buffer_view\n" - f" %result = call @module.{function.name}(%input, %kernel) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n" - ) - - op = op + ( - f" %n = arith.constant {shape.n} : i64\n" - f" %c = arith.constant {shape.c} : i64\n" - f" %h = arith.constant {shape.h} : i64\n" - f" %w = arith.constant {shape.w} : i64\n" - f" %f = arith.constant {shape.f} : i64\n" - f" %kh = arith.constant {shape.kh} : i64\n" - f" %kw = arith.constant {shape.kw} : i64\n" - f" %sh = arith.constant {conv2d_attr.STRIDE[0]} : i64\n" - f" %sw = arith.constant {conv2d_attr.STRIDE[1]} : i64\n" - f" %dh = arith.constant {conv2d_attr.DILATION[0]} : i64\n" - f" %dw = arith.constant {conv2d_attr.DILATION[1]} : i64\n" - f" call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n" - ) - - op = op + " return\n" - op = op + "}\n" - - return TestCall(function=function, op=op) - - -# Generates all output files' contents as strings. -def generate( - input_elem_type: InputElemTypeId, - input_layout: InputLayout, - kernel_elem_type: KernelElemTypeId, - kernel_layout: KernelLayout, - conv2d_attr: ConvAttrs, - acc_type: InputElemTypeId, - shapes_id: ShapesId, -): - functions = {} - calls = [] - - for shape in get_test_shapes(shapes_id): - for dynamicity in get_dynamicities(shapes_id): - function = generate_function( - input_elem_type, - input_layout, - kernel_elem_type, - kernel_layout, - acc_type, - conv2d_attr, - shape, - dynamicity, - ) - # Different testcases may differ only by runtime parameters but - # share the same code. For example, dynamic-shapes testcases - # share the same code involving dynamic tensor types even though the runtime - # values in the trace are different. That's why we append conditionally - # to functions, but unconditionally to calls. - if function.name not in functions: - functions[function.name] = function - calls.append( - generate_call( - function, - input_elem_type, - input_layout, - kernel_elem_type, - kernel_layout, - conv2d_attr, - acc_type, - shape, - ) - ) - - return (functions, calls) - - -def parse_arguments(): - parser = argparse.ArgumentParser(description="Generator of e2e conv2d tests") - parser.add_argument( - "--output_conv2d_mlir", - type=str, - help="Path of output .mlir file containing the generated conv2d functions", - required=True, - ) - parser.add_argument( - "--output_calls_mlir", - type=str, - help="Path of output .mlir file containing the calls", - required=True, - ) - parser.add_argument( - "--input_type", - type=str, - choices=["f32", "f16"], - help="Numeric type of input tensors", - required=True, - ) - parser.add_argument( - "--input_layout", - type=str, - default="nchw", - choices=["nchw", "nhwc"], - help="Layout of the input tensor. 
Defaults to nchw.", - required=False, - ) - parser.add_argument( - "--kernel_type", - type=str, - choices=["f32", "f16"], - help="Numeric type of kernel tensors", - required=True, - ) - parser.add_argument( - "--kernel_layout", - type=str, - default="fchw", - choices=["fchw", "hwcf"], - help="Layout of the kernel tensor. Defaults to fchw.", - required=False, - ) - parser.add_argument( - "--acc_type", - type=str, - choices=["f32", "f16"], - help="Numeric type of accumulator tensors", - default="", - required=False, - ) - parser.add_argument( - "--shapes", - type=str, - choices=[s.value for s in ShapesId], - help="Collection of tensor shapes to test", - required=True, - ) - parser.add_argument( - "--dilation", - type=str, - default="1,1", - help="The dilation factor for the convolution operation. Comma-separated. As in 1,1", - required=False, - ) - parser.add_argument( - "--stride", - type=str, - default="1,1", - help="The stride factor for the convolution operation. Comma-separated. As in 1,1", - required=False, - ) - parser.add_argument( - "--requirements", - type=str, - help="Target requirements for this module. Comma-separated. As in --iree-llvmcpu-target-cpu-features. If the target device does not meet all of the requirements, the test will be skipped.", - required=False, - ) - return parser.parse_args() - - -def write_code_file(functions, filename): - with open(filename, "w") as file: - for function in functions.values(): - file.write(function.definition + "\n") - - -def write_calls_file(functions, calls, filename, requirements): - # Module-level reflection information used to control the test tool. - reflection = "" - if requirements: - reflection = ( - "iree.reflection = {" - 'target_features = "' - + ",".join([req.lstrip("+") for req in requirements.split(",")]) - + '"' - "}" - ) - module_definition = ( - f"builtin.module @calls attributes {{\n" f" {reflection}\n" f"}} {{\n\n" - ) - - # Declare the custom module that generates arguments. - module_definition = module_definition + ( - "func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n" - "func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n" - "\n" - ) - - # Declare the functions that will be called. - for function in functions.values(): - module_definition = module_definition + function.import_declaration + "\n" - module_definition = module_definition + "\n" - - # Emit the test cases for each call. - for call in calls: - module_definition = module_definition + call.op + "\n" - - module_definition = module_definition + "\n}\n" - - with open(filename, "w") as file: - file.write(module_definition) - - -def main(args): - input_type = InputElemTypeId(args.input_type) - input_layout = InputLayout(args.input_layout) - kernel_type = KernelElemTypeId(args.kernel_type) - kernel_layout = KernelLayout(args.kernel_layout) - # TODO: The output type is the same as the input type for now.
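-    # (args.acc_type is parsed above but currently ignored here.)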
- acc_type = input_type - shapes_id = ShapesId(args.shapes) - conv2d_attr = ConvAttrs( - tuple(map(int, args.stride.split(","))), - tuple(map(int, args.dilation.split(","))), - ) - - (functions, calls) = generate( - input_type, - input_layout, - kernel_type, - kernel_layout, - conv2d_attr, - acc_type, - shapes_id, - ) - - write_code_file(functions, args.output_conv2d_mlir) - write_calls_file( - functions, - calls, - args.output_calls_mlir, - args.requirements, - ) - - -if __name__ == "__main__": - main(parse_arguments()) diff --git a/tests/e2e/matmul/BUILD.bazel b/tests/e2e/matmul/BUILD.bazel deleted file mode 100644 index b4c2b51e429b..000000000000 --- a/tests/e2e/matmul/BUILD.bazel +++ /dev/null @@ -1,704 +0,0 @@ -# Copyright 2022 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# End-to-end matrix multiplication tests. - -load("//build_tools/bazel:iree_e2e_generated_runner_test.bzl", "iree_generated_e2e_runner_test") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -py_binary( - name = "generate_e2e_matmul_tests", - srcs = ["generate_e2e_matmul_tests.py"], -) - -########################################################################### -## -## LLVMCPU backend -## -########################################################################### - -# LLVMCPU, non-data-tiling, no microkernels -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cpu_nondt_%s_%s_%s" % (lhs_rhs_type, acc_type, size), - compiler_flags = [ - "--iree-opt-data-tiling=false", - "--iree-llvmcpu-enable-ukernels=none", - "--iree-llvmcpu-enable-scalable-vectorization", - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=%s" % size, - ], - tags = [ - # f16/bf16 trigger internal LLVM assertion errors on riscv and wasm. - "noriscv", - "nowasm", - ] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else [], - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"] + - # Widening matmuls fail to lower for SVE. - (["arm_64:sve:+sve"] if lhs_rhs_type == acc_type else []), - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for (lhs_rhs_type, acc_type) in [ - # ("i8", "i32"), # TODO(#15800): enable once compile time is reasonable - # ("f32", "f32"), # TODO(#15800): enable once compile time is reasonable - # ("f16", "f16"), # TODO(#15800): enable once compile time is reasonable - # ("f16", "f32"), # TODO(#15800): enable once compile time is reasonable - # TODO(#15258): enable bf16 tests when that bug is fixed. 
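-    # (Note: with every dtype pair above and below commented out, this comprehension currently expands to zero test targets.)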
- # ("bf16", "bf16"), - # ("bf16", "f32"), -] for size in [ - "small", - "large", -]] - -PREPROCESSING_TRANSPOSE_LHS = "--iree-preprocessing-pass-pipeline=builtin.module\\(util.func\\(iree-preprocessing-transpose-matmul-pass{input=lhs}\\)\\)" - -PREPROCESSING_PEEL = "--iree-llvmcpu-vector-pproc-strategy=peel" - -# LLVMCPU, non-data-tiling, no microkernels, ArmSME -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cpu_arm_sme_nondt_%s_%s%s%s" % ( - dtype, - size, - "_transpose_lhs" if transpose_lhs else "", - "_peel" if peel else "", - ), - compiler_flags = [ - "--iree-opt-data-tiling=false", - "--iree-llvmcpu-enable-scalable-vectorization", - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown", - ] + ([PREPROCESSING_TRANSPOSE_LHS] if transpose_lhs else []) + - ([PREPROCESSING_PEEL] if peel else []), - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % dtype, - "--acc_type=%s" % dtype, - "--shapes=%s" % size, - ], - tags = [ - "requires-arm-sme", - ], - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["arm_64:sme:+sve,+sme"], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for dtype in [ - "f32", - # "f64" (also supported for ArmSME, but not by the test generator) -] for size in [ - "small", - "large", -] for transpose_lhs in [ - True, - False, -] for peel in [ - True, - False, -]] - -X86_64_AVX2 = [ - "+avx", - "+avx2", - "+fma", - "+f16c", -] - -X86_64_AVX512 = X86_64_AVX2 + [ - "+avx512f", - "+avx512vl", - "+avx512cd", - "+avx512bw", - "+avx512dq", -] - -X86_64_AVX512_VNNI = X86_64_AVX512 + [ - "+avx512vnni", -] - -X86_64_AVX512_BF16 = X86_64_AVX512 + [ - "+avx512bf16", -] - -# LLVMCPU, data-tiling, data-tiling + ukernels. -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cpu_dt%s_%s_%s_%s" % ( - ("_uk" if use_uk else ""), - lhs_rhs_type, - acc_type, - size, - ), - compiler_flags = [ - "--iree-opt-data-tiling", - ] + ["--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none")], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=%s" % size, - ], - tags = ([ - # "--shapes=large" can cause timeouts on sanitizers. - "noasan", - "notsan", - ] if size == "large" else []) + ([ - # "--shapes=large" can cause timeouts on RISC-V emulator. - # f16/bf16 trigger internal LLVM assertion errors on riscv and wasm. 
- "noriscv", - "nowasm", - ] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else []), - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"] + - ([ - "arm_64:dotprod:+dotprod", - "arm_64:i8mm:+i8mm", - "x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI), - ] if lhs_rhs_type == "i8" and acc_type == "i32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - ] if lhs_rhs_type == "f32" and acc_type == "f32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "arm_64:fullfp16:+fullfp16", - ] if lhs_rhs_type == "f16" and acc_type == "f16" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "arm_64:fp16fml:+fp16fml", - ] if lhs_rhs_type == "f16" and acc_type == "f32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16), - "arm_64:bf16:+bf16", - ] if lhs_rhs_type == "bf16" and acc_type == "bf16" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16), - "arm_64:bf16:+bf16", - ] if lhs_rhs_type == "bf16" and acc_type == "f32" else []), - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for use_uk in [ - False, - True, -] for (lhs_rhs_type, acc_type) in ( - [ - ("i8", "i32"), - ("f32", "f32"), - ("f16", "f16"), - ("f16", "f32"), - ("bf16", "bf16"), - ("bf16", "f32"), - ] -) for size in [ - "small", - "large", -]] - -# LLVMCPU, data-tiling, data-tiling + ukernels + late materialization. -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cpu_experimental_dt%s_%s_%s_%s" % ( - ("_uk" if use_uk else ""), - lhs_rhs_type, - acc_type, - size, - ), - compiler_flags = [ - "--iree-opt-data-tiling", - "--iree-global-opt-enable-early-materialization=false", - ] + ["--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none")], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=%s" % size, - ], - tags = ([ - # "--shapes=large" can cause timeouts on sanitizers. - "noasan", - "notsan", - ] if size == "large" else []) + ([ - # "--shapes=large" can cause timeouts on RISC-V emulator. - # f16/bf16 trigger internal LLVM assertion errors on riscv and wasm. 
- "noriscv", - "nowasm", - ] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else []), - target_backends_and_drivers = [ - ("llvm-cpu", "local-task"), - ], - target_cpu_features_variants = ["default"] + - ([ - "arm_64:dotprod:+dotprod", - "arm_64:i8mm:+i8mm", - "x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI), - ] if lhs_rhs_type == "i8" and acc_type == "i32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - ] if lhs_rhs_type == "f32" and acc_type == "f32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "arm_64:fullfp16:+fullfp16", - ] if lhs_rhs_type == "f16" and acc_type == "f16" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "arm_64:fp16fml:+fp16fml", - ] if lhs_rhs_type == "f16" and acc_type == "f32" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16), - "arm_64:bf16:+bf16", - ] if lhs_rhs_type == "bf16" and acc_type == "bf16" else [ - "x86_64:avx2:" + ",".join(X86_64_AVX2), - "x86_64:avx512:" + ",".join(X86_64_AVX512), - "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16), - "arm_64:bf16:+bf16", - ] if lhs_rhs_type == "bf16" and acc_type == "f32" else []), - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for use_uk in [ - False, - True, -] for (lhs_rhs_type, acc_type) in ( - [ - ("i8", "i32"), - ("f32", "f32"), - ("f16", "f16"), - ("f16", "f32"), - ("bf16", "bf16"), - ("bf16", "f32"), - ] -) for size in [ - "small", - "large", -]] - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_vmvx_experimental_dt%s_%s_%s" % ( - ("_uk" if use_uk else ""), - lhs_rhs_type, - acc_type, - ), - compiler_flags = [ - "--iree-opt-data-tiling", - "--iree-global-opt-enable-early-materialization=false", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=small", - ], - tags = [], - target_backends_and_drivers = [ - ("vmvx", "local-task"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for use_uk in [ - False, - True, -] for (lhs_rhs_type, acc_type) in ( - [ - ("f32", "f32"), - ] -)] - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_experimental_dt_%s_%s" % ( - lhs_rhs_type, - acc_type, - ), - compiler_flags = [ - "--iree-opt-data-tiling", - "--iree-global-opt-enable-early-materialization=false", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=small", - ], - tags = [], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for (lhs_rhs_type, acc_type) in ( - [ - ("f32", "f32"), - ] -)] - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_spirv_experimental_dt_%s_%s" % ( - lhs_rhs_type, - acc_type, - ), - compiler_flags = [ - "--iree-opt-data-tiling", - "--iree-global-opt-enable-early-materialization=false", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--acc_type=%s" % acc_type, - "--shapes=small", - ], - tags = [], - target_backends_and_drivers = [ - ("vulkan-spirv", "vulkan"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for (lhs_rhs_type, acc_type) in ( - [ - 
("f32", "f32"), - ] -)] - -########################################################################### -## -## VMVX backend -## -########################################################################### - -# VMVX, data-tiling + microkernels. -[iree_generated_e2e_runner_test( - name = "e2e_matmul_vmvx_dt_uk_%s_small" % lhs_rhs_type, - compiler_flags = [ - "--iree-vmvx-enable-microkernels", - "--iree-opt-data-tiling", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--shapes=small", - ], - target_backends_and_drivers = [ - ("vmvx", "local-task"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for lhs_rhs_type in [ - "i8", - "f32", -]] - -########################################################################### -## -## CUDA backend -## -########################################################################### - -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f32_large_simt", - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f32", - "--shapes=gpu_large_aligned", - "--compilation_info=LLVMGPUMatmulSimt", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-nvidia", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -# Testing Ampere + TensorCore path. -# WMMA TensorCore(F32): wmma.161616.f32.tf32 -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f32_large_tensorcore", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f32", - "--shapes=gpu_large_aligned", - "--compilation_info=LLVMGPUMatmulTensorCore", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f32_large_unaligned", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f32", - "--shapes=gpu_large", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f16_large_unaligned", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f16", - "--shapes=gpu_large", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. 
- "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -# MMA.SYNC TensorCore(F32): mma.sync.1688.f32.t32 -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f32_large_mma_sync", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f32", - "--shapes=gpu_large_aligned", - "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -# WMMA TensorCore(F16): wmma.161616.f16.f16 -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f16_large_tensorcore", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f16", - "--shapes=gpu_large_aligned", - "--compilation_info=LLVMGPUMatmulTensorCore", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -# MMA.SYNC TensorCore(F16): mma.sync.161616.f16.f16 -iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_f16_large_mma_sync", - compiler_flags = [ - "--iree-hal-cuda-llvm-target-arch=sm_80", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f16", - "--shapes=gpu_large_aligned", - "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_cuda_%s_large_splitk" % lhs_rhs_type, - compiler_flags = [ - "--iree-flow-split-matmul-reduction=4", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--shapes=large", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-nvidia", - # "--shapes=large" can cause timeouts on riscv emulator. 
- "noriscv", - ], - target_backends_and_drivers = [ - ("cuda", "cuda"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for lhs_rhs_type in [ - "f32", -]] - -########################################################################### -## -## Vulkan backend -## -########################################################################### - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_vulkan_{0}_large_valhall".format(lhs_rhs_type), - compiler_flags = [ - "--iree-vulkan-target-triple=valhall-unknown-android31", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--shapes=gpu_large_aligned", - "--compilation_info=SPIRVVectorizeMali", - ], - tags = [ - # Nvidia GPUs support a superset of Valhall features - "requires-gpu-nvidia", - "vulkan_uses_vk_khr_shader_float16_int8", - ], - target_backends_and_drivers = [ - ("vulkan-spirv", "vulkan"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for lhs_rhs_type in [ - "i8", - "f16", - "f32", -]] - -[iree_generated_e2e_runner_test( - name = "e2e_matmul_vulkan_{0}_large_ampere".format(lhs_rhs_type), - compiler_flags = [ - "--iree-vulkan-target-triple=ampere-unknown-linux", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=%s" % lhs_rhs_type, - "--shapes=gpu_large_aligned", - "--compilation_info=SPIRVVectorizeNVIDIA", - ], - tags = [ - "requires-gpu-sm80", - "vulkan_uses_vk_khr_shader_float16_int8", - ], - target_backends_and_drivers = [ - ("vulkan-spirv", "vulkan"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) for lhs_rhs_type in [ - "i8", - "f16", - "f32", -]] - -iree_generated_e2e_runner_test( - name = "e2e_matmul_vulkan_f16_large_rdna3", - compiler_flags = [ - "--iree-vulkan-target-triple=rdna3-unknown-linux", - ], - generator = ":generate_e2e_matmul_tests", - generator_args = [ - "--lhs_rhs_type=f16", - "--shapes=gpu_large_aligned", - "--compilation_info=SPIRVCooperativeMatrixVectorize", - ], - runner_args = [ - "--require_exact_results=false", - ], - tags = [ - "requires-gpu", - "requires-gpu-rdna3", - "vulkan_uses_vk_khr_shader_float16_int8", - ], - target_backends_and_drivers = [ - ("vulkan-spirv", "vulkan"), - ], - test_runner = "//tools/testing/e2e:iree-e2e-matmul-test", - test_type = "matmul", -) diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt deleted file mode 100644 index 0556e756bef6..000000000000 --- a/tests/e2e/matmul/CMakeLists.txt +++ /dev/null @@ -1,2540 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/e2e/matmul/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_small_transpose_lhs_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_small_transpose_lhs - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_small_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_large_transpose_lhs_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - 
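-  # (SME coverage comes from the arm_64:sme feature variant below.)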
TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_large_transpose_lhs - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_large_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_arm_sme_nondt_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - 
iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - 
"x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - 
"--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - 
e2e_matmul_cpu_dt_uk_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - 
"x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_dt_uk_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - 
"default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - 
"--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - 
"x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" 
- "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - 
iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cpu_experimental_dt_uk_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vmvx_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vmvx_experimental_dt_uk_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_spirv_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vmvx_dt_uk_i8_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - 
COMPILER_FLAGS - "--iree-vmvx-enable-microkernels" - "--iree-opt-data-tiling" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vmvx_dt_uk_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-vmvx-enable-microkernels" - "--iree-opt-data-tiling" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f32_large_simt - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUMatmulSimt" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-nvidia" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f32_large_tensorcore - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUMatmulTensorCore" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f32_large_unaligned - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f16_large_unaligned - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--shapes=gpu_large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f32_large_mma_sync - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f16_large_tensorcore - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUMatmulTensorCore" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f16_large_mma_sync - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - 
"--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-hal-cuda-llvm-target-arch=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cuda_f32_large_splitk - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=large" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-flow-split-matmul-reduction=4" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-nvidia" - "noriscv" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_i8_large_valhall - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeMali" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=valhall-unknown-android31" - LABELS - "requires-gpu-nvidia" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_f16_large_valhall - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeMali" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=valhall-unknown-android31" - LABELS - "requires-gpu-nvidia" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_f32_large_valhall - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeMali" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=valhall-unknown-android31" - LABELS - "requires-gpu-nvidia" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_i8_large_ampere - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeNVIDIA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=ampere-unknown-linux" - LABELS - "requires-gpu-sm80" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_f16_large_ampere - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeNVIDIA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=ampere-unknown-linux" - LABELS - "requires-gpu-sm80" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_f32_large_ampere - TEST_TYPE - matmul - GENERATOR - 
"generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVVectorizeNVIDIA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=ampere-unknown-linux" - LABELS - "requires-gpu-sm80" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_vulkan_f16_large_rdna3 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--shapes=gpu_large_aligned" - "--compilation_info=SPIRVCooperativeMatrixVectorize" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS - "--iree-vulkan-target-triple=rdna3-unknown-linux" - RUNNER_ARGS - "--require_exact_results=false" - LABELS - "requires-gpu" - "requires-gpu-rdna3" - "vulkan_uses_vk_khr_shader_float16_int8" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### - -# To distinguish between CDNA(gfx9) and RDNA3(gfx11) -if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") - -unset(IREE_HIP_TEST_COMPILER_FLAGS) -list(APPEND IREE_HIP_TEST_COMPILER_FLAGS - "--iree-rocm-target-chip=${IREE_HIP_TEST_TARGET_CHIP}" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f16_large_cdna3_mfma - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f32_large_cdna3_mfma - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f16_large_cdna3_mfma_tb - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--transpose_rhs" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - -if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx94") - -# I8 Intrinsics has different layout on CDNA3/gfx94x, -# and only CDNA3/gfx94x has F8 intrinsics. 
- -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f8_large_cdna3_mfma - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f8E4M3FNUZ" - "--acc_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_i8_large_cdna3_mfma_tb - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--transpose_rhs" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_cdna_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) -endif() - -elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") - -unset(IREE_HIP_TEST_COMPILER_FLAGS) -list(APPEND IREE_HIP_TEST_COMPILER_FLAGS - "--iree-rocm-target-chip=${IREE_HIP_TEST_TARGET_CHIP}" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f16_large_rdna3_wmma - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeWMMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - RUNNER_ARGS - "--require_exact_results=false" - "--acceptable_fp_delta=1e-04" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-rdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f16_large_rdna3_wmma_tb - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--transpose_rhs" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeWMMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - RUNNER_ARGS - "--require_exact_results=false" - "--acceptable_fp_delta=1e-04" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-rdna3" -) - -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rdna3_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - "noasan" - 
"nomsan" - "notsan" - "noubsan" - "requires-gpu-rdna3" -) - -endif() diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py deleted file mode 100644 index 6cb270233386..000000000000 --- a/tests/e2e/matmul/generate_e2e_matmul_tests.py +++ /dev/null @@ -1,992 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""iree_generated_e2e_matmul_test generator for e2e matmul tests. -""" - -from typing import Optional -import argparse -import os -import re -import enum -import dataclasses -import typing -import itertools - - -# Data type of matrix entries. The string values must match MLIR data types. -# This is a superset of the values accepted for the --lhs_rhs_types= flag, -# as this also includes accumulator-specific types like i32. -@enum.unique -class MatrixElemTypeId(enum.Enum): - NONE = "" - I8 = "i8" - I32 = "i32" - F32 = "f32" - F16 = "f16" - F8E4M3FNUZ = "f8E4M3FNUZ" - BF16 = "bf16" - - -# Enumerates of the collections of shapes that we can generate tests for. -# The values are the accepted values for the --shapes= flag. -@enum.unique -class ShapesId(enum.Enum): - SMALL = "small" - LARGE = "large" - GPU_LARGE = "gpu_large" - GPU_LARGE_ALIGNED = "gpu_large_aligned" - - -# Enumerates of the collections of compilation info that we can generate tests -# for. The values are the accepted values for the --compilation_info= flag. -@enum.unique -class CompilationInfoId(enum.Enum): - NONE = "" - LLVMGPUMatmulSimt = "LLVMGPUMatmulSimt" - LLVMGPUMatmulTensorCore = "LLVMGPUMatmulTensorCore" - LLVMGPUMatmulTensorCoreMmaSync = "LLVMGPUMatmulTensorCoreMmaSync" - LLVMGPUVectorDistributeMFMA = "LLVMGPUVectorDistributeMFMA" - LLVMGPUVectorDistributeWMMA = "LLVMGPUVectorDistributeWMMA" - SPIRVCooperativeMatrixVectorize = "SPIRVCooperativeMatrixVectorize" - SPIRVVectorizeMali = "SPIRVVectorizeMali" - SPIRVVectorizeNVIDIA = "SPIRVVectorizeNVIDIA" - - -# Enumerates ways to construct MLIR tensor types. -@enum.unique -class Dynamicity(enum.Enum): - DYNAMIC = "dynamic" # Use '?' everywhere. Example: tensor. - STATIC = "static" # Use fixed values everywhere. Example: tensor<4x6xf32>. - MIXED = "mixed" # Randomly mix '?' and values. Example: tensor. - - -# Enumerates ways to initialize matrix buffer contents. -@enum.unique -class MatrixGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - -# Describes the shape of a matrix multiplication in the usual convention: -# the LHS is {m}x{k}, the RHS is {k}x{n}, the accumulator/result is {m}x{n}. -# The extra `accumulate` boolean tells whether the matmul is accumulating into -# an existing accumulator (C += A * B) or just overwriting the result -# (C = A * B). -@dataclasses.dataclass -class TestShape: - m: int - k: int - n: int - accumulate: bool - - -# Describes a workgroup and tiling schedule to target a specific MMA intrinsic. 
-@dataclasses.dataclass -class MMASchedule: - intrinsic: str - m_count: int # Number of subgroups per workgroup along M - n_count: int # Number of subgroups per workgroup along N - m_tile_count: int - n_tile_count: int - k_tile_count: int - - def __str__(self): - return ( - "mma_schedule = #iree_gpu.mma_schedule<" - + f"intrinsic = #iree_gpu.mma_layout<{self.intrinsic}>, " - + f"subgroup_m_count = {self.m_count}, " - + f"subgroup_n_count = {self.n_count}>" - ) - - -# Describes how to construct compilation info for the testcase. -@dataclasses.dataclass -class CompilationInfo: - # Lowering Config - tile_sizes: typing.List[typing.List[int]] - # Translation Info - dispatch_lowering_pass_pipeline: str - software_pipeline_depth: int - mma_schedule: typing.Optional[MMASchedule] - # Compilation info - workgroup_size: typing.List[int] - subgroup_size: Optional[int] = None - - # Prints the workgroup size - def workgroup_size_str(self): - return "workgroup_size = [" + ", ".join(map(str, self.workgroup_size)) + "]" - - -# Returns the list of TestShape's to use for the collection of shapes -# identified by shapes_id. -def get_test_shapes(shapes_id: ShapesId): - # Notes: - # 1. Be conservative in adding more shapes, as that can increase both the - # build and execution latency of tests. The build latency is nearly the - # same for all shapes, while execution latency grows cubicly i.e. - # linearly with m*k*n. - # 2. Some shapes are commented out: they used to be tested but have been - # disabled to improve the trade-off between test coverage and build - # latency. - if shapes_id == ShapesId.SMALL: - return [ - # square matrices. Start by the simplest case of 1x1x1. - TestShape(m=1, k=1, n=1, accumulate=True), - TestShape(m=1, k=1, n=1, accumulate=False), - # Test some small powers of two, that exercise in particular the - # adjustment of data-tiling tile sizes to narrow cases. - TestShape(m=2, k=2, n=2, accumulate=True), - TestShape(m=4, k=4, n=4, accumulate=True), - TestShape(m=8, k=8, n=8, accumulate=True), - # test 9x9x9 because as many kernel M0/K0/N0 dims are equal to 8, - # this will often be the smallest value that exercises something above - # the kernel's size. - TestShape(m=9, k=9, n=9, accumulate=True), - # rectangular matrices. - # >= 2x differences between M/N/K dims may exercise tiling corner cases - # not exercised by nearly-square matrices. - TestShape(m=6, k=13, n=3, accumulate=True), - TestShape(m=15, k=37, n=7, accumulate=False), - TestShape(m=81, k=19, n=41, accumulate=True), - # shapes involving vectors (i.e. most rectangular cases) - # This is particularly relevant because we have dedicated kernels for - # the matrix*vector / vector*matrix case. - TestShape(m=1, k=10, n=10, accumulate=True), # vector*matrix - TestShape(m=1, k=10, n=10, accumulate=False), # vector*matrix - TestShape(m=10, k=1, n=10, accumulate=True), # outer-product - TestShape(m=10, k=10, n=1, accumulate=True), # matrix*vector - TestShape(m=10, k=10, n=1, accumulate=False), # matrix*vector - ] - if shapes_id == ShapesId.LARGE: - return [ - # some random large sizes - TestShape(m=123, k=456, n=789, accumulate=True), - TestShape(m=654, k=321, n=234, accumulate=False), - # shapes involving vectors (i.e. most rectangular cases) - TestShape(m=1, k=1000, n=1000, accumulate=True), # large vector*matrix - TestShape(m=1000, k=1000, n=1, accumulate=True), # large matrix*vector - TestShape(m=1000, k=1000, n=1, accumulate=False), # large matrix*vector - # Be conservative in adding larger shapes. 
They can result in - # high latency tests. If you have to, consider splitting them - # out in a way that constrains the latency impact, e.g. by - # running on fewer backends/drivers or with fewer generators - # (see get_test_generators). - ] - if shapes_id == ShapesId.GPU_LARGE_ALIGNED: - return [ - TestShape(m=512, k=128, n=512, accumulate=True), - TestShape(m=512, k=128, n=512, accumulate=False), - ] - if shapes_id == ShapesId.GPU_LARGE: - return [ - # unaligned cases. - TestShape(m=457, k=330, n=512, accumulate=False), - TestShape(m=457, k=330, n=514, accumulate=False), - TestShape(m=438, k=330, n=514, accumulate=False), - TestShape(m=540, k=332, n=516, accumulate=False), - TestShape(m=1000, k=4, n=512, accumulate=False), - TestShape(m=4, k=1000, n=512, accumulate=False), - TestShape(m=512, k=1000, n=4, accumulate=False), - TestShape(m=512, k=128, n=500, accumulate=False), - TestShape(m=457, k=160, n=512, accumulate=False), - TestShape(m=512, k=330, n=512, accumulate=False), - ] - - raise ValueError(shapes_id) - - -# Returns the list of Dynamicity's to use for the collection of shapes -# identified by shapes_id. -def get_dynamicities(shapes_id: ShapesId): - if shapes_id == ShapesId.GPU_LARGE or shapes_id == ShapesId.GPU_LARGE_ALIGNED: - return [ - Dynamicity.STATIC, - ] - else: - return [ - Dynamicity.DYNAMIC, - Dynamicity.STATIC, - ] - raise ValueError(shapes_id) - - -@dataclasses.dataclass -class TileWorkgroupSizePair: - tile_size: typing.List[typing.List[int]] - workgroup_size: typing.List[int] - - -# Constructs a TileWorkgroupSizePair for SPIR-V targets enforcing the -# constraints between the workgroup_size and tile size -def get_spirv_tile_workgroup_size_pair( - workgroup_size, t_tile_k, t_tile_m=4, t_tile_n=4 -): - x, y, z = workgroup_size - wg_tile_m = y * t_tile_m - wg_tile_n = x * t_tile_n - return TileWorkgroupSizePair( - [[wg_tile_m, wg_tile_n], [t_tile_m, t_tile_n], [0, 0, t_tile_k]], workgroup_size - ) - - -# Returns all the TileWorkgroupSizePairs for a given SPIRV Target -def get_all_spirv_tile_workgroup_size_pairs(t_tile_k): - tile_workgroup_size_pairs = [ - get_spirv_tile_workgroup_size_pair([32, 8, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([16, 8, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([64, 2, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([8, 8, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([32, 1, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([16, 2, 1], t_tile_k), - get_spirv_tile_workgroup_size_pair([32, 1, 1], t_tile_k), - ] - return tile_workgroup_size_pairs - - -def get_rocm_test_compilation_infos( - compilation_info_id: CompilationInfoId, lhs_rhs_type: MatrixElemTypeId -): - intrinsic = "" - if compilation_info_id == CompilationInfoId.LLVMGPUVectorDistributeMFMA: - intrinsic = "MFMA" - elif compilation_info_id == CompilationInfoId.LLVMGPUVectorDistributeWMMA: - intrinsic = "WMMA" - else: - raise ValueError("Unknown pipeline for rocm") - - schedules = [] - if intrinsic == "MFMA": - schedules = [ - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 1, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 1, 2), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 2, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 2, 1, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 2, 2, 1, 1, 2), - MMASchedule("MFMA_F32_16x16x16_F16", 1, 1, 1, 1, 1), - MMASchedule("MFMA_F32_16x16x16_F16", 1, 1, 1, 1, 2), - MMASchedule("MFMA_F32_16x16x16_F16", 1, 1, 1, 2, 1), - MMASchedule("MFMA_F32_16x16x16_F16", 1, 1, 2, 1, 1), - MMASchedule("MFMA_F32_16x16x16_F16", 2, 2, 1, 
1, 1),
-            MMASchedule("MFMA_F32_16x16x16_F16", 2, 4, 2, 1, 2),
-            MMASchedule("MFMA_F32_16x16x16_F16", 4, 2, 4, 2, 2),
-            MMASchedule("MFMA_F32_32x32x8_F16", 1, 1, 1, 2, 2),
-            MMASchedule("MFMA_F32_32x32x8_F16", 2, 2, 1, 1, 1),
-            MMASchedule("MFMA_F32_32x32x8_F16", 1, 4, 2, 1, 2),
-            MMASchedule("MFMA_F32_32x32x8_F16", 4, 2, 1, 2, 4),
-            MMASchedule("MFMA_F32_16x16x32_F8E4M3FNUZ", 1, 1, 1, 1, 1),
-            MMASchedule("MFMA_F32_16x16x32_F8E4M3FNUZ", 2, 2, 1, 1, 2),
-            MMASchedule("MFMA_F32_16x16x32_F8E4M3FNUZ", 4, 1, 4, 1, 1),
-            MMASchedule("MFMA_F32_16x16x32_F8E4M3FNUZ", 4, 2, 4, 2, 1),
-            MMASchedule("MFMA_I32_16x16x32_I8", 1, 1, 1, 1, 1),
-            MMASchedule("MFMA_I32_16x16x32_I8", 2, 2, 1, 1, 2),
-            MMASchedule("MFMA_I32_16x16x32_I8", 4, 1, 4, 1, 1),
-            MMASchedule("MFMA_I32_16x16x32_I8", 4, 2, 4, 2, 1),
-            MMASchedule("MFMA_I32_32x32x16_I8", 1, 1, 1, 1, 1),
-            MMASchedule("MFMA_I32_32x32x16_I8", 2, 2, 1, 1, 2),
-            MMASchedule("MFMA_I32_32x32x16_I8", 4, 1, 1, 2, 2),
-            MMASchedule("MFMA_I32_32x32x16_I8", 4, 2, 2, 2, 2),
-        ]
-    elif intrinsic == "WMMA":
-        schedules = [
-            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 1),
-            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 2),
-            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 2, 1),
-            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 2, 1, 1),
-            MMASchedule("WMMA_F32_16x16x16_F16", 2, 2, 1, 1, 1),
-            MMASchedule("WMMA_F32_16x16x16_F16", 2, 4, 2, 1, 2),
-            MMASchedule("WMMA_F32_16x16x16_F16", 4, 2, 4, 2, 2),
-        ]
-    else:
-        raise NotImplementedError("unhandled intrinsic case")
-
-    subgroup_size = 64 if intrinsic == "MFMA" else 32
-
-    infos = []
-    for schedule in schedules:
-        # Skip schedules whose intrinsic element type does not match the
-        # requested one. The input type is extracted from the intrinsic
-        # name; the naming convention is [output_type]_MxNxK_[input_type].
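
As a concrete instance of the convention just described (an illustrative sketch, not generator code; the actual parse follows in the loop below):

# Decomposing one intrinsic name under [output_type]_MxNxK_[input_type].
name = "MFMA_F32_16x16x4_F32"
parts = name.split("_")  # ["MFMA", "F32", "16x16x4", "F32"]
output_type = parts[1]  # accumulator element type: "F32"
m, n, k = (int(d) for d in parts[2].split("x"))  # intrinsic shape: 16, 16, 4
input_type = parts[-1]  # lhs/rhs element type: "F32"

The workgroup tile computed below then scales the intrinsic shape by the schedule's counts: for example, MFMA_F32_16x16x16_F16 with subgroup counts 2x2 and m/n/k tile counts 1/1/2 yields a [32, 32, 32] workgroup tile and, with the MFMA subgroup size of 64, a [128, 2, 1] workgroup size.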
- input_type = schedule.intrinsic.split("_")[-1] - if lhs_rhs_type.value.upper() != input_type: - continue - - if schedule.intrinsic == "MFMA_F32_16x16x4_F32": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 - wg_tile_k = schedule.k_tile_count * 4 - elif schedule.intrinsic == "MFMA_F32_16x16x16_F16": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 - wg_tile_k = schedule.k_tile_count * 16 - elif schedule.intrinsic == "MFMA_F32_32x32x8_F16": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 32 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 32 - wg_tile_k = schedule.k_tile_count * 8 - elif ( - schedule.intrinsic == "MFMA_I32_16x16x32_I8" - or schedule.intrinsic == "MFMA_F32_16x16x32_F8E4M3FNUZ" - ): - wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 - wg_tile_k = schedule.k_tile_count * 32 - elif schedule.intrinsic == "MFMA_I32_32x32x16_I8": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 32 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 32 - wg_tile_k = schedule.k_tile_count * 16 - elif schedule.intrinsic == "WMMA_F32_16x16x16_F16": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 - wg_tile_k = schedule.k_tile_count * 16 - else: - raise NotImplementedError("unhandled intrinsic case") - - workgroup_tile = [[wg_tile_m, wg_tile_n, wg_tile_k]] - workgroup_size = [schedule.n_count * subgroup_size, schedule.m_count, 1] - infos.append( - CompilationInfo( - tile_sizes=workgroup_tile, - dispatch_lowering_pass_pipeline="LLVMGPUVectorDistribute", - workgroup_size=workgroup_size, - software_pipeline_depth=0, - mma_schedule=schedule, - subgroup_size=subgroup_size, - ) - ) - return infos - - -# Returns the list of CompilationInfo's to use for the CompilationInfoId. 
-def get_test_compilation_infos(
-    compilation_info_id: CompilationInfoId, lhs_rhs_type: MatrixElemTypeId
-) -> typing.List[typing.Optional[CompilationInfo]]:
-    if compilation_info_id == CompilationInfoId.NONE:
-        return [None]
-
-    if compilation_info_id in [
-        CompilationInfoId.LLVMGPUVectorDistributeMFMA,
-        CompilationInfoId.LLVMGPUVectorDistributeWMMA,
-    ]:
-        return get_rocm_test_compilation_infos(compilation_info_id, lhs_rhs_type)
-
-    software_pipeline_depth = 0
-    if compilation_info_id == CompilationInfoId.LLVMGPUMatmulSimt:
-        tile_workgroup_size_pairs = [
-            TileWorkgroupSizePair([[32, 128, 32]], [32, 8, 1]),
-            TileWorkgroupSizePair([[128, 64, 8]], [16, 8, 1]),
-            TileWorkgroupSizePair([[16, 256, 32]], [64, 2, 1]),
-            TileWorkgroupSizePair([[8, 32, 32]], [8, 8, 1]),
-            TileWorkgroupSizePair([[8, 128, 4]], [32, 1, 1]),
-            TileWorkgroupSizePair([[16, 64, 4]], [16, 2, 1]),
-            TileWorkgroupSizePair([[1, 128, 8]], [32, 1, 1]),
-        ]
-        software_pipeline_depth = 3
-    elif compilation_info_id == CompilationInfoId.SPIRVCooperativeMatrixVectorize:
-        tile_workgroup_size_pairs = [
-            TileWorkgroupSizePair(
-                [[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]], [64, 2, 1]
-            )
-        ]
-    elif compilation_info_id == CompilationInfoId.SPIRVVectorizeNVIDIA:
-        tile_workgroup_size_pairs = get_all_spirv_tile_workgroup_size_pairs(32)
-    elif compilation_info_id == CompilationInfoId.SPIRVVectorizeMali:
-        tile_workgroup_size_pairs = get_all_spirv_tile_workgroup_size_pairs(4)
-    elif (
-        compilation_info_id == CompilationInfoId.LLVMGPUMatmulTensorCore
-        or compilation_info_id == CompilationInfoId.LLVMGPUMatmulTensorCoreMmaSync
-    ):
-        tile_workgroup_size_pairs = []
-        ## WarpShape = 2x2
-        tile_workgroup_size_pairs.append(
-            TileWorkgroupSizePair([[32, 32, 16]], [64, 2, 1])
-        )
-        tile_workgroup_size_pairs.append(
-            TileWorkgroupSizePair([[64, 64, 64]], [64, 2, 1])
-        )
-
-        ## WarpShape = 4x1
-        tile_workgroup_size_pairs.append(
-            TileWorkgroupSizePair([[32, 32, 32]], [64, 1, 1])
-        )
-
-        ## WarpShape = 2x2 with large tiles using larger Shared Memory capacity.
-        if lhs_rhs_type == MatrixElemTypeId.F16:
-            tile_workgroup_size_pairs.append(
-                TileWorkgroupSizePair([[128, 128, 64]], [64, 2, 1])
-            )
-        elif lhs_rhs_type == MatrixElemTypeId.F32:
-            tile_workgroup_size_pairs.append(
-                TileWorkgroupSizePair([[128, 128, 16]], [64, 2, 1])
-            )
-        software_pipeline_depth = 3
-
-    compilation_infos = []
-    for tile_workgroup_size_pair in tile_workgroup_size_pairs:
-        compilation_infos.append(
-            CompilationInfo(
-                tile_sizes=tile_workgroup_size_pair.tile_size,
-                dispatch_lowering_pass_pipeline=compilation_info_id.value,
-                workgroup_size=tile_workgroup_size_pair.workgroup_size,
-                software_pipeline_depth=software_pipeline_depth,
-                mma_schedule=None,
-            )
-        )
-    return compilation_infos
-
-
-# Intentionally fixed seed! We want full reproducibility here, both across runs
-# and across machines.
-# Intentionally not shared with pseudorandom_generator_seed to limit the ways
-# in which shuffling testcases changes which random values are generated.
-local_pseudorandom_state = 1
-
-
-# A shape dimension value, i.e. a size value that could appear in a MLIR type
-# such as 'tensor<?x4xf32>'. None means a dynamic size, similar to '?' in MLIR.
-@dataclasses.dataclass
-class DimSize:
-    value: typing.Optional[int]
-
-
-# Generates a compile-time MLIR size value, i.e. either a fixed positive integer
-# or None (which maps to MLIR '?') depending on dynamicity.
-
-
-# Generates a compile-time MLIR size value, i.e. either a fixed positive integer
-# or None (which maps to MLIR '?') depending on dynamicity.
-def shape_dim(x: int, dynamicity: Dynamicity):
-    if dynamicity == Dynamicity.DYNAMIC:
-        return DimSize(None)
-    elif dynamicity == Dynamicity.STATIC:
-        return DimSize(x)
-    else:
-        raise ValueError(dynamicity)
-
-
-# Stringification used for generating MLIR types, e.g. tensor<?x?xf32>.
-def int_or_question_mark(s: DimSize):
-    return s.value or "?"
-
-
-# Stringification used for generating alphanumeric identifiers, e.g.
-# func.func @somefunction_DYNxDYNxf32, where we can't use "?" characters.
-def int_or_DYN(s: DimSize):
-    return s.value or "DYN"
-
-
-# Gets a friendlier form/type that we can use as the arg type and then cast
-# into the target_type.
-def cast_argtype_if_required(target_type: MatrixElemTypeId):
-    if target_type == MatrixElemTypeId.F8E4M3FNUZ:
-        return MatrixElemTypeId.F32
-    return target_type
-
-
-# Gets the op needed to cast/convert from the friendly form/type into the
-# target_type.
-def get_castback_from_arg_op(target_type: MatrixElemTypeId):
-    if target_type == MatrixElemTypeId.F8E4M3FNUZ:
-        return "arith.truncf"
-    raise ValueError(f"Unhandled castback type of {target_type}")
-
-
-# Describes the fully resolved shape dimensions of all 3 input matrices,
-# LHS, RHS, and Accumulator, in a testcase.
-# Each value is a DimSize, which either holds a positive integer such as 123,
-# or None, meaning a dynamic dimension as in MLIR's '?'.
-# These values are used to generate MLIR function names and tensor shapes.
-@dataclasses.dataclass
-class TestInputMatricesShapes:
-    lhs_rows: DimSize
-    lhs_cols: DimSize
-    rhs_rows: DimSize
-    rhs_cols: DimSize
-    acc_rows: DimSize
-    acc_cols: DimSize
-
-
-# Helper for generate_function. Generates TestInputMatricesShapes, i.e.
-# converts from the runtime shape dimensions in TestShape and given dynamicity to
-# the set of shapes to be used in a test function's input tensors.
-def generate_shapes(shape: TestShape, transpose_rhs: bool, dynamicity: Dynamicity):
-    lhs_rows = shape_dim(shape.m, dynamicity)
-    lhs_cols = shape_dim(shape.k, dynamicity)
-    acc_rows = shape_dim(shape.m, dynamicity)
-    acc_cols = shape_dim(shape.n, dynamicity)
-    if transpose_rhs:
-        rhs_rows = shape_dim(shape.n, dynamicity)
-        rhs_cols = shape_dim(shape.k, dynamicity)
-    else:
-        rhs_rows = shape_dim(shape.k, dynamicity)
-        rhs_cols = shape_dim(shape.n, dynamicity)
-    shapes = TestInputMatricesShapes(
-        lhs_rows=lhs_rows,
-        lhs_cols=lhs_cols,
-        rhs_rows=rhs_rows,
-        rhs_cols=rhs_cols,
-        acc_rows=acc_rows,
-        acc_cols=acc_cols,
-    )
-    return shapes
-
-
-# Helper for generate_function.
-# Generates a name for a test function in the generated MLIR code.
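Before the name-generation helper that the comment above introduces, a quick check of the stringification helpers defined earlier in this hunk (illustrative; both rely on sizes being positive, since `s.value or ...` would also map a size of 0 to the dynamic spelling):

```python
# Dynamic dims render as "?" in types but "DYN" in identifiers.
assert int_or_question_mark(DimSize(None)) == "?"
assert int_or_DYN(DimSize(None)) == "DYN"
# Static dims pass through as the integer itself.
assert int_or_question_mark(DimSize(123)) == 123
assert int_or_DYN(DimSize(123)) == 123
```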
-def generate_function_name( - lhs_rhs_type: MatrixElemTypeId, - acc_type: MatrixElemTypeId, - shapes: TestInputMatricesShapes, - accumulate: bool, - compilation_info: typing.Optional[CompilationInfo] = None, -): - input_t = lhs_rhs_type.value - acc_t = acc_type.value - lhs_r = int_or_DYN(shapes.lhs_rows) - lhs_c = int_or_DYN(shapes.lhs_cols) - rhs_r = int_or_DYN(shapes.rhs_rows) - rhs_c = int_or_DYN(shapes.rhs_cols) - acc_r = int_or_DYN(shapes.acc_rows) - acc_c = int_or_DYN(shapes.acc_cols) - - info = "" - if compilation_info: - tile_sizes = list(itertools.chain(*compilation_info.tile_sizes)) - tile_workgroup_key = ( - "_".join([str(a) for a in tile_sizes]) - + "_" - + "_".join([str(a) for a in compilation_info.workgroup_size]) - ) - info = f"_for_{compilation_info.dispatch_lowering_pass_pipeline}_{tile_workgroup_key}" - - matmul_kind = "matmul_accumulate" if accumulate else "matmul" - return ( - f"{matmul_kind}_{lhs_r}x{lhs_c}x{input_t}_times_" - + f"{rhs_r}x{rhs_c}x{input_t}_into_{acc_r}x{acc_c}x{acc_t}{info}" - ) - - -# Represents a generated test function. -@dataclasses.dataclass -class MLIRFunction: - name: str - signature: str - import_declaration: str - definition: str - - -# Generates a test function in the generated MLIR code. -# The generated function will take the same arguments as linalg.matmul variants -# and will just call linalg.matmul variants with them, returning its result. -def generate_function( - lhs_rhs_type: MatrixElemTypeId, - acc_type: MatrixElemTypeId, - shape: TestShape, - transpose_rhs: bool, - dynamicity: Dynamicity, - compilation_info: typing.Optional[CompilationInfo] = None, -): - shapes = generate_shapes(shape, transpose_rhs, dynamicity) - func_name = generate_function_name( - lhs_rhs_type, acc_type, shapes, shape.accumulate, compilation_info - ) - lhs_r = int_or_question_mark(shapes.lhs_rows) - lhs_c = int_or_question_mark(shapes.lhs_cols) - rhs_r = int_or_question_mark(shapes.rhs_rows) - rhs_c = int_or_question_mark(shapes.rhs_cols) - acc_r = int_or_question_mark(shapes.acc_rows) - acc_c = int_or_question_mark(shapes.acc_cols) - - casted_lhs_rhs_type = cast_argtype_if_required(lhs_rhs_type) - lhs_tensor_type = f"tensor<{lhs_r}x{lhs_c}x{casted_lhs_rhs_type.value}>" - rhs_tensor_type = f"tensor<{rhs_r}x{rhs_c}x{casted_lhs_rhs_type.value}>" - acc_tensor_type = f"tensor<{acc_r}x{acc_c}x{acc_type.value}>" - - if transpose_rhs: - op_name = "linalg.matmul_transpose_b" - else: - op_name = "linalg.matmul" - - # Compilation info is optional; prints empty string by default. 
- func_definition = "" - compilation_info_attr = "" - if compilation_info: - requested_pipeline = compilation_info.dispatch_lowering_pass_pipeline - compiler_pipeline = requested_pipeline - if requested_pipeline == "SPIRVVectorizeMali": - compiler_pipeline = "SPIRVBaseVectorize" - elif requested_pipeline == "SPIRVCooperativeMatrixVectorize": - compiler_pipeline = "SPIRVCooperativeMatrixVectorize" - elif requested_pipeline == "SPIRVVectorizeNVIDIA": - # TODO: change to test SPIRVMatmulPromoteVectorize too - compiler_pipeline = "SPIRVBaseVectorize" - - mma_schedule = "" - if compilation_info.mma_schedule is not None: - mma_schedule = ", {}".format(compilation_info.mma_schedule) - subgroup_size_str = "" - if compilation_info.subgroup_size is not None: - subgroup_size_str = f"subgroup_size = {compilation_info.subgroup_size}" - - compilation_info_string = ( - f"#compilation{generate_function.compilation_index} = " - "#iree_codegen.compilation_info<\n" - f" lowering_config = #iree_codegen.lowering_config,\n" - f" translation_info = <{compiler_pipeline} {compilation_info.workgroup_size_str()}\n" - f" {subgroup_size_str},\n" - f" {{ pipeline_depth = {compilation_info.software_pipeline_depth}, " - f" store_stage = 1{mma_schedule} }}>>\n" - ) - compilation_info_attr = ( - f"{{compilation_info = #compilation{generate_function.compilation_index}}} " - ) - func_definition = func_definition + compilation_info_string - generate_function.compilation_index += 1 - compute = f" %result = {op_name} {compilation_info_attr}ins(%lhs, %rhs: {lhs_tensor_type}, {rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n" - if casted_lhs_rhs_type != lhs_rhs_type: - castback_op = get_castback_from_arg_op(lhs_rhs_type) - compute_lhs_tensor_type = f"tensor<{lhs_r}x{lhs_c}x{lhs_rhs_type.value}>" - compute_rhs_tensor_type = f"tensor<{rhs_r}x{rhs_c}x{lhs_rhs_type.value}>" - compute = ( - f" %lhs_casted = {castback_op} %lhs: {lhs_tensor_type} to {compute_lhs_tensor_type}\n" - f" %rhs_casted = {castback_op} %rhs: {rhs_tensor_type} to {compute_rhs_tensor_type}\n" - f" %result = {op_name} {compilation_info_attr}ins(%lhs_casted, %rhs_casted: {compute_lhs_tensor_type}, {compute_rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}" - ) - if shape.accumulate: - signature = f"({lhs_tensor_type}, {rhs_tensor_type}, {acc_tensor_type}) -> {acc_tensor_type}" - import_declaration = f"func.func private @module.{func_name}(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view" - func_definition = func_definition + ( - f"func.func @{func_name}(%lhs: {lhs_tensor_type}, %rhs: {rhs_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n" - f"{compute}\n" - f" return %result: {acc_tensor_type}\n" - f"}}\n" - ) - else: - literal_zero_for_acc_type = "0.0" if "f" in acc_type.value else "0" - if acc_r == "?": - signature = f"({lhs_tensor_type}, {rhs_tensor_type}) -> {acc_tensor_type}" - import_declaration = f"func.func private @module.{func_name}(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view" - func_definition = func_definition + ( - f"func.func @{func_name}(%lhs: {lhs_tensor_type}, %rhs: {rhs_tensor_type}) -> {acc_tensor_type} {{\n" - f" %c0 = arith.constant 0 : index\n" - f" %c1 = arith.constant 1 : index\n" - f" %acc_dim0 = tensor.dim %lhs, %c0 : {lhs_tensor_type}\n" - f" %acc_dim1 = tensor.dim %rhs, %c1 : {rhs_tensor_type}\n" - f" %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : {acc_tensor_type}\n" - f" %c0_acc_type = arith.constant 
{literal_zero_for_acc_type}: {acc_type.value}\n" - f" %acc = linalg.fill ins(%c0_acc_type : {acc_type.value}) outs(%init_acc : {acc_tensor_type}) -> {acc_tensor_type}\n" - f"{compute}" - f" return %result: {acc_tensor_type}\n" - f"}}\n" - ) - else: - signature = f"({lhs_tensor_type}, {rhs_tensor_type}) -> {acc_tensor_type}" - import_declaration = f"func.func private @module.{func_name}(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view" - func_definition = func_definition + ( - f"func.func @{func_name}(%lhs: {lhs_tensor_type}, %rhs: {rhs_tensor_type}) -> {acc_tensor_type} {{\n" - f" %init_acc = tensor.empty() : {acc_tensor_type}\n" - f" %c0_acc_type = arith.constant {literal_zero_for_acc_type}: {acc_type.value}\n" - f" %acc = linalg.fill ins(%c0_acc_type : {acc_type.value}) outs(%init_acc : {acc_tensor_type}) -> {acc_tensor_type}\n" - f"{compute}" - f" return %result: {acc_tensor_type}\n" - f"}}\n" - ) - return MLIRFunction( - name=func_name, - signature=signature, - import_declaration=import_declaration, - definition=func_definition, - ) - - -# Counter for producing unique compilation info attrs -generate_function.compilation_index = 0 - - -# Represents a call to a generated test function. -@dataclasses.dataclass -class TestCall: - function: MLIRFunction - op: str - - -# Intentionally fixed seed! We want full reproducibility here, both across runs -# and across machines. -# Intentionally not shared with local_pseudorandom_state to limit the ways -# in which shuffling testcases changes which random values are generated. -pseudorandom_generator_seed = 1 - - -def contents_generator_tag(generator: MatrixGenerator): - if generator == MatrixGenerator.ZERO: - return "" - elif generator == MatrixGenerator.RANDOM: - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return f"!tag:iree:fully_specified_pseudorandom {pseudorandom_generator_seed}" - else: - raise ValueError(generator) - - -# Generate a matrix function argument of the given size as `%name`. -def generate_random_matrix( - name: str, - matrix_shape: list, - element_type: MatrixElemTypeId, -): - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return ( - f" %{name}_dim0 = arith.constant {matrix_shape[0]} : i64\n" - f" %{name}_dim1 = arith.constant {matrix_shape[1]} : i64\n" - f" %{name}_element_type = hal.element_type<{element_type.value}> : i32\n" - f" %{name}_seed = arith.constant {pseudorandom_generator_seed} : i32\n" - f" %{name} = call @matmul_test.generate_random_matrix(%device, %{name}_dim0, %{name}_dim1, %{name}_element_type, %{name}_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view\n" - ) - - -call_id = 0 - - -# Generates the output trace for a testcase i.e. a single test function call, -# as a dictionary to be passed to yaml.dump. 
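To make the generated artifacts concrete, here is a sketch of one invocation (not from the original file; `TestShape`'s fields are inferred from the call sites in this script):

```python
shape = TestShape(m=8, k=4, n=2, accumulate=True)  # hypothetical shape
f = generate_function(
    MatrixElemTypeId.F16,  # lhs/rhs element type
    MatrixElemTypeId.F32,  # accumulator element type
    shape,
    transpose_rhs=False,
    dynamicity=Dynamicity.STATIC,
)
# f.name == "matmul_accumulate_8x4xf16_times_4x2xf16_into_8x2xf32"
# f.signature ==
#   "(tensor<8x4xf16>, tensor<4x2xf16>, tensor<8x2xf32>) -> tensor<8x2xf32>"
```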
-
-
-# Generates the output op for a testcase, i.e. a single test function call.
-def generate_call(
-    function: MLIRFunction,
-    lhs_rhs_type: MatrixElemTypeId,
-    acc_type: MatrixElemTypeId,
-    shape: TestShape,
-    transpose_rhs: bool,
-):
-    global call_id
-    func_name = f"{function.name}_{shape.m}_{shape.k}_{shape.n}"
-    if shape.accumulate:
-        func_name = f"{func_name}_acc"
-    func_name = f"{func_name}_{call_id}"
-    call_id = call_id + 1
-
-    description = f"Matmul shape (MxKxN): {shape.m}x{shape.k}x{shape.n}"
-    op = (
-        f"func.func @{func_name}() attributes {{\n"
-        f'  iree.reflection = {{description = "{description}"}}\n'
-        "} {\n"
-        "  %device_index = arith.constant 0 : index\n"
-        "  %device = hal.devices.get %device_index : !hal.device\n"
-    )
-
-    lhs_shape = [shape.m, shape.k]
-    if transpose_rhs:
-        rhs_shape = [shape.n, shape.k]
-        transpose_rhs = 1
-    else:
-        rhs_shape = [shape.k, shape.n]
-        transpose_rhs = 0
-
-    casted_lhs_rhs_type = cast_argtype_if_required(lhs_rhs_type)
-    op = op + generate_random_matrix("lhs", lhs_shape, casted_lhs_rhs_type)
-    op = op + generate_random_matrix("rhs", rhs_shape, casted_lhs_rhs_type)
-    if shape.accumulate:
-        op = op + generate_random_matrix("acc", [shape.m, shape.n], acc_type)
-        # TODO(#16168): there's a bug with in-place input->output aliasing and
-        # we work around it here by passing in a unique copy.
-        global pseudorandom_generator_seed
-        pseudorandom_generator_seed = pseudorandom_generator_seed - 1
-        op = op + generate_random_matrix("acc_copy", [shape.m, shape.n], acc_type)
-        op = op + (
-            f"  %result = call @module.{function.name}(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n"
-        )
-    else:
-        op = op + (
-            f"  %acc = util.null : !hal.buffer_view\n"
-            f"  %result = call @module.{function.name}(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n"
-        )
-
-    op = op + (
-        f"  %m = arith.constant {shape.m} : i64\n"
-        f"  %k = arith.constant {shape.k} : i64\n"
-        f"  %n = arith.constant {shape.n} : i64\n"
-        f"  %transpose_rhs = arith.constant {transpose_rhs} : i32\n"
-        f"  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n"
-    )
-
-    op = op + "  return\n"
-    op = op + "}\n"
-
-    return TestCall(function=function, op=op)
-
-
-# Generates all output files' contents as strings.
-def generate(
-    lhs_rhs_type: MatrixElemTypeId,
-    acc_type: MatrixElemTypeId,
-    shapes_id: ShapesId,
-    transpose_rhs: bool,
-    compilation_info_id: CompilationInfoId,
-):
-    functions = {}
-    calls = []
-
-    for compilation_info in get_test_compilation_infos(
-        compilation_info_id, lhs_rhs_type
-    ):
-        for shape in get_test_shapes(shapes_id):
-            for dynamicity in get_dynamicities(shapes_id):
-                function = generate_function(
-                    lhs_rhs_type,
-                    acc_type,
-                    shape,
-                    transpose_rhs,
-                    dynamicity,
-                    compilation_info,
-                )
-                # Different testcases may differ only by runtime parameters but
-                # share the same code. For example, dynamic-shapes testcases
-                # share the same code involving tensor<?x?xf32> even though the
-                # runtime values in the trace are different. That's why we append
-                # conditionally to calls, but unconditionally to function_definitions.
-                if function.name not in functions:
-                    functions[function.name] = function
-                calls.append(
-                    generate_call(
-                        function, lhs_rhs_type, acc_type, shape, transpose_rhs
-                    )
-                )
-
-    return (functions, calls)
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="Generator of e2e matmul tests")
-    parser.add_argument(
-        "--output_matmul_mlir",
-        type=str,
-        help="Path of output .mlir file containing the generated matmuls",
-        required=True,
-    )
-    parser.add_argument(
-        "--output_calls_mlir",
-        type=str,
-        help="Path of output .mlir file containing the calls",
-        required=True,
-    )
-    parser.add_argument(
-        "--lhs_rhs_type",
-        type=str,
-        choices=["i32", "i8", "f32", "f16", "f8E4M3FNUZ", "bf16"],
-        help="Numeric type of input matrices",
-        required=True,
-    )
-    parser.add_argument(
-        "--acc_type",
-        type=str,
-        choices=["i32", "f32", "f16", "bf16"],
-        help="Numeric type of the accumulator and result matrices",
-        default="",
-        required=False,
-    )
-    parser.add_argument(
-        "--shapes",
-        type=str,
-        choices=[s.value for s in ShapesId],
-        help="Collection of matrix shapes to test",
-        required=True,
-    )
-    parser.add_argument(
-        "--transpose_rhs",
-        action="store_true",
-        help="Whether to transpose RHS",
-        default=False,
-        required=False,
-    )
-    parser.add_argument(
-        "--compilation_info",
-        type=str,
-        choices=[i.value for i in CompilationInfoId],
-        help="Collection of compilation info setups to test",
-        default="",
-        required=False,
-    )
-    parser.add_argument(
-        "--requirements",
-        type=str,
-        help="Target requirements for this module. Comma-separated. As in -iree-llvmcpu-target-cpu-features. If the target device does not meet all of the requirements, the test will be skipped.",
-        required=False,
-    )
-    return parser.parse_args()
-
-
-def write_code_file(functions, filename):
-    with open(filename, "w") as file:
-        for function in functions.values():
-            file.write(function.definition + "\n")
-
-
-def write_calls_file(functions, calls, filename, requirements):
-    # Module-level reflection information used to control the test tool.
-    reflection = ""
-    if requirements:
-        reflection = (
-            "iree.reflection = {"
-            'target_features = "'
-            + ",".join([req.lstrip("+") for req in requirements.split(",")])
-            + '"'
-            "}"
-        )
-    module_definition = (
-        f"builtin.module @calls attributes {{\n" f"  {reflection}\n" f"}} {{\n\n"
-    )
-
-    # Declare the custom module that generates arguments.
-    module_definition = module_definition + (
-        "func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n"
-        "func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n"
-        "\n"
-    )
-
-    # Declare the functions that will be called.
-    for function in functions.values():
-        module_definition = module_definition + function.import_declaration + "\n"
-    module_definition = module_definition + "\n"
-
-    # Emit the test cases for each call.
-    for call in calls:
-        module_definition = module_definition + call.op + "\n"
-
-    module_definition = module_definition + "\n}\n"
-
-    with open(filename, "w") as file:
-        file.write(module_definition)
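A small illustration of how `--requirements` flows into the reflection attribute emitted by `write_calls_file` above (hypothetical flag value):

```python
reqs = "+avx512f,+avx512vl"  # hypothetical --requirements value
features = ",".join(req.lstrip("+") for req in reqs.split(","))
assert features == "avx512f,avx512vl"
# write_calls_file then emits a module header of the form:
#   builtin.module @calls attributes {
#     iree.reflection = {target_features = "avx512f,avx512vl"}
#   } { ... }
```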
-
-
-# For now, the accumulator type can always be inferred from the input LHS/RHS
-# type, so we do that. That is temporary: eventually there will be cases
-# where the same input types are used with different accumulator types, e.g.
-# f16 inputs with both f16 and f32 accumulator.
-def infer_acc_type(lhs_rhs_type: MatrixElemTypeId, acc_type: MatrixElemTypeId):
-    if acc_type != MatrixElemTypeId.NONE:
-        return acc_type
-    if lhs_rhs_type == MatrixElemTypeId.F8E4M3FNUZ:
-        return MatrixElemTypeId.F32
-    if lhs_rhs_type == MatrixElemTypeId.I8:
-        return MatrixElemTypeId.I32
-    return lhs_rhs_type
-
-
-def main(args):
-    lhs_rhs_type = MatrixElemTypeId(args.lhs_rhs_type)
-    acc_type = MatrixElemTypeId(args.acc_type)
-    acc_type = infer_acc_type(lhs_rhs_type, acc_type)
-    shapes_id = ShapesId(args.shapes)
-    compilation_info_id = CompilationInfoId(args.compilation_info)
-
-    (functions, calls) = generate(
-        lhs_rhs_type, acc_type, shapes_id, args.transpose_rhs, compilation_info_id
-    )
-
-    write_code_file(functions, args.output_matmul_mlir)
-    write_calls_file(
-        functions,
-        calls,
-        args.output_calls_mlir,
-        args.requirements,
-    )
-
-
-if __name__ == "__main__":
-    main(parse_arguments())
diff --git a/tools/testing/BUILD.bazel b/tools/testing/BUILD.bazel
deleted file mode 100644
index 2a6834ce38b3..000000000000
--- a/tools/testing/BUILD.bazel
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright 2024 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-package(
-    features = ["layering_check"],
-    licenses = ["notice"],  # Apache 2.0
-)
diff --git a/tools/testing/CMakeLists.txt b/tools/testing/CMakeLists.txt
deleted file mode 100644
index ae2678c84ef5..000000000000
--- a/tools/testing/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright 2024 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-iree_add_all_subdirs()
diff --git a/tools/testing/e2e/BUILD.bazel b/tools/testing/e2e/BUILD.bazel
deleted file mode 100644
index 397627961d20..000000000000
--- a/tools/testing/e2e/BUILD.bazel
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2024 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_binary", "iree_runtime_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_runtime_cc_library( - name = "e2e_test_util", - srcs = ["test_utils.c"], - hdrs = ["test_utils.h"], - deps = [ - "//runtime/src/iree/base", - "//runtime/src/iree/base/internal", - "//runtime/src/iree/base/internal:cpu", - "//runtime/src/iree/base/internal:flags", - "//runtime/src/iree/base/internal:path", - "//runtime/src/iree/hal", - "//runtime/src/iree/modules/hal", - "//runtime/src/iree/tooling:context_util", - "//runtime/src/iree/tooling:device_util", - "//runtime/src/iree/vm", - "//runtime/src/iree/vm:cc", - ], -) - -iree_runtime_cc_binary( - name = "iree-e2e-matmul-test", - srcs = ["iree-e2e-matmul-test.cc"], - deps = [ - ":e2e_test_util", - "//runtime/src/iree/base", - "//runtime/src/iree/base/internal", - "//runtime/src/iree/base/internal:cpu", - "//runtime/src/iree/base/internal:flags", - "//runtime/src/iree/base/internal:path", - "//runtime/src/iree/hal", - "//runtime/src/iree/modules/hal", - "//runtime/src/iree/tooling:context_util", - "//runtime/src/iree/tooling:device_util", - "//runtime/src/iree/vm", - "//runtime/src/iree/vm:cc", - ], -) - -iree_runtime_cc_binary( - name = "iree-e2e-conv2d-test", - srcs = ["iree-e2e-conv2d-test.cc"], - deps = [ - ":e2e_test_util", - "//runtime/src/iree/base", - "//runtime/src/iree/base/internal", - "//runtime/src/iree/base/internal:cpu", - "//runtime/src/iree/base/internal:flags", - "//runtime/src/iree/base/internal:path", - "//runtime/src/iree/hal", - "//runtime/src/iree/modules/hal", - "//runtime/src/iree/tooling:context_util", - "//runtime/src/iree/tooling:device_util", - "//runtime/src/iree/vm", - "//runtime/src/iree/vm:cc", - ], -) - -iree_runtime_cc_binary( - name = "iree-e2e-attention-test", - srcs = ["iree-e2e-attention-test.cc"], - deps = [ - ":e2e_test_util", - "//runtime/src/iree/base", - "//runtime/src/iree/base/internal", - "//runtime/src/iree/base/internal:cpu", - "//runtime/src/iree/base/internal:flags", - "//runtime/src/iree/base/internal:path", - "//runtime/src/iree/hal", - "//runtime/src/iree/modules/hal", - "//runtime/src/iree/tooling:context_util", - "//runtime/src/iree/tooling:device_util", - "//runtime/src/iree/vm", - "//runtime/src/iree/vm:cc", - ], -) diff --git a/tools/testing/e2e/CMakeLists.txt b/tools/testing/e2e/CMakeLists.txt deleted file mode 100644 index ece0c59d00b0..000000000000 --- a/tools/testing/e2e/CMakeLists.txt +++ /dev/null @@ -1,100 +0,0 @@ -if(NOT IREE_ENABLE_THREADING) - return() -endif() - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_ABOVE_THIS_LINE ### -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tools/testing/e2e/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
#
-################################################################################
-
-iree_add_all_subdirs()
-
-iree_cc_library(
-  NAME
-    e2e_test_util
-  HDRS
-    "test_utils.h"
-  SRCS
-    "test_utils.c"
-  DEPS
-    iree::base
-    iree::base::internal
-    iree::base::internal::cpu
-    iree::base::internal::flags
-    iree::base::internal::path
-    iree::hal
-    iree::modules::hal
-    iree::tooling::context_util
-    iree::tooling::device_util
-    iree::vm
-    iree::vm::cc
-  PUBLIC
-)
-
-iree_cc_binary(
-  NAME
-    iree-e2e-matmul-test
-  SRCS
-    "iree-e2e-matmul-test.cc"
-  DEPS
-    ::e2e_test_util
-    iree::base
-    iree::base::internal
-    iree::base::internal::cpu
-    iree::base::internal::flags
-    iree::base::internal::path
-    iree::hal
-    iree::modules::hal
-    iree::tooling::context_util
-    iree::tooling::device_util
-    iree::vm
-    iree::vm::cc
-)
-
-iree_cc_binary(
-  NAME
-    iree-e2e-conv2d-test
-  SRCS
-    "iree-e2e-conv2d-test.cc"
-  DEPS
-    ::e2e_test_util
-    iree::base
-    iree::base::internal
-    iree::base::internal::cpu
-    iree::base::internal::flags
-    iree::base::internal::path
-    iree::hal
-    iree::modules::hal
-    iree::tooling::context_util
-    iree::tooling::device_util
-    iree::vm
-    iree::vm::cc
-)
-
-iree_cc_binary(
-  NAME
-    iree-e2e-attention-test
-  SRCS
-    "iree-e2e-attention-test.cc"
-  DEPS
-    ::e2e_test_util
-    iree::base
-    iree::base::internal
-    iree::base::internal::cpu
-    iree::base::internal::flags
-    iree::base::internal::path
-    iree::hal
-    iree::modules::hal
-    iree::tooling::context_util
-    iree::tooling::device_util
-    iree::vm
-    iree::vm::cc
-)
-
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/tools/testing/e2e/iree-e2e-attention-test.cc b/tools/testing/e2e/iree-e2e-attention-test.cc
deleted file mode 100644
index 4b0464b13dfb..000000000000
--- a/tools/testing/e2e/iree-e2e-attention-test.cc
+++ /dev/null
@@ -1,486 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "iree/base/api.h"
-#include "iree/base/internal/cpu.h"
-#include "iree/base/internal/flags.h"
-#include "iree/base/internal/math.h"
-#include "iree/base/internal/path.h"
-#include "iree/hal/api.h"
-#include "iree/modules/hal/module.h"
-#include "iree/tooling/context_util.h"
-#include "iree/tooling/device_util.h"
-#include "iree/vm/api.h"
-#include "iree/vm/native_module_cc.h"
-#include "tools/testing/e2e/test_utils.h"
-
-//===----------------------------------------------------------------------===//
-// Reference Attention
-//===----------------------------------------------------------------------===//
-
-// Helper for reference_attention.
-// Function to allocate and initialize tensors
-float* allocate_tensor(int dim1, int dim2, int dim3) {
-  const int size = dim1 * dim2 * dim3;
-  float* tensor = (float*)malloc(size * sizeof(float));
-  for (int i = 0; i < size; ++i) {
-    tensor[i] = 0.0f;
-  }
-  return tensor;
-}
-
-// Function to free allocated tensors
-void free_tensor(float* tensor) {
-  if (tensor != nullptr) free(tensor);
-}
-
-// Function to calculate 1D index for a 3D array
-int index_3d(int i, int j, int k, int dim2, int dim3) {
-  return i * dim2 * dim3 + j * dim3 + k;
-}
-
-static void reference_attention_f32_f32_f32_f32(
-    iree_hal_dim_t M, iree_hal_dim_t K1, iree_hal_dim_t K2, iree_hal_dim_t N,
-    iree_hal_dim_t B, const float* query_data, const float* key_data,
-    const float* value_data, float* result_data, iree_hal_dim_t b,
-    float* Attention) {
-  // Compute Q * K^T
-  for (int m = 0; m < M; ++m) {
-    for (int k2 = 0; k2 < K2; ++k2) {
-      float sum = 0.0;
-      for (int k1 = 0; k1 < K1; ++k1) {
-        int q_idx = index_3d(b, m, k1, M, K1);
-        int k_idx = index_3d(b, k2, k1, K2, K1);
-
-        sum += query_data[q_idx] * key_data[k_idx];
-      }
-      int att_idx = index_3d(0, m, k2, M, K2);
-      Attention[att_idx] = sum / sqrt(K1);  // Scale by sqrt(K1)
-    }
-  }
-
-  // Compute softmax on Attention
-  for (int m = 0; m < M; ++m) {
-    // Find the maximum value for the current sequence
-    float max_val = -FLT_MAX;
-    for (int k2 = 0; k2 < K2; ++k2) {
-      int att_idx = index_3d(0, m, k2, M, K2);
-      max_val = iree_max(max_val, Attention[att_idx]);
-    }
-
-    // Calculate the softmax denominator
-    float sum = 0.0f;
-    for (int k2 = 0; k2 < K2; ++k2) {
-      int att_idx = index_3d(0, m, k2, M, K2);
-      sum += exp(Attention[att_idx] - max_val);
-    }
-
-    // Apply softmax. The numerator must subtract the same max_val used in the
-    // denominator, otherwise the weights do not normalize to 1.
-    for (int k2 = 0; k2 < K2; ++k2) {
-      int att_idx = index_3d(0, m, k2, M, K2);
-      Attention[att_idx] = exp(Attention[att_idx] - max_val) / sum;
-    }
-  }
-
-  // Compute Attention * V
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float sum = 0.0;
-      for (int k2 = 0; k2 < K2; ++k2) {
-        int att_idx = index_3d(0, m, k2, M, K2);
-        int v_idx = index_3d(b, k2, n, K2, N);
-        sum += Attention[att_idx] * value_data[v_idx];
-      }
-      int o_idx = index_3d(b, m, n, M, N);
-      result_data[o_idx] = sum;
-    }
-  }
-}
-
-static iree_status_t reference_attention_element(
-    iree_hal_dim_t M, iree_hal_dim_t K1, iree_hal_dim_t K2, iree_hal_dim_t N,
-    iree_hal_dim_t B, iree_hal_element_type_t query_elem_type,
-    iree_hal_element_type_t key_elem_type,
-    iree_hal_element_type_t value_elem_type, void* query_data, void* key_data,
-    void* value_data, void* actual_data, void* result_data, iree_hal_dim_t b,
-    float* Attention) {
-  if (query_elem_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 &&
-      key_elem_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 &&
-      value_elem_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
-    reference_attention_f32_f32_f32_f32(
-        M, K1, K2, N, B, (const float*)query_data, (const float*)key_data,
-        (const float*)value_data, (float*)result_data, b, Attention);
-
-  } else {
-    return iree_make_status(
-        IREE_STATUS_INVALID_ARGUMENT,
-        "unhandled combination of element types in attention");
-  }
-  return iree_ok_status();
-}
-
-// Reference attention implementation, used to compare attention results
-// against.
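The scalar kernel above implements scaled-dot-product attention; the same math in vectorized form, as a NumPy sketch (not part of the deleted tool), makes the stabilized softmax explicit:

```python
import numpy as np

def reference_attention(q, k, v):
    # q: [M, K1], k: [K2, K1], v: [K2, N] -- one batch slice, matching the
    # scalar loops above.
    scores = q @ k.T / np.sqrt(q.shape[1])       # scale by sqrt(K1)
    scores -= scores.max(axis=1, keepdims=True)  # subtract row max before exp
    weights = np.exp(scores)
    weights /= weights.sum(axis=1, keepdims=True)
    return weights @ v                           # [M, N]
```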
-static iree_status_t reference_attention( - iree_hal_dim_t B, iree_hal_dim_t M, iree_hal_dim_t K1, iree_hal_dim_t K2, - iree_hal_dim_t N, iree_hal_element_type_t query_elem_type, - iree_hal_element_type_t key_elem_type, - iree_hal_element_type_t value_elem_type, iree_byte_span_t query_contents, - iree_byte_span_t key_contents, iree_byte_span_t value_contents, - iree_byte_span_t actual_contents, iree_byte_span_t result_contents, - int compute_every) { - IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, B); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, M); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, K1); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, K2); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, N); - - iree_host_size_t count = 0; - float* Attention = allocate_tensor(1, M, K2); - for (iree_hal_dim_t b = 0; b < B; ++b) { - if (++count < compute_every) continue; - count = 0; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - reference_attention_element( - M, K1, K2, N, B, query_elem_type, key_elem_type, value_elem_type, - query_contents.data, key_contents.data, value_contents.data, - actual_contents.data, result_contents.data, b, Attention)); - } - free_tensor(Attention); - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} -//===----------------------------------------------------------------------===// -// Attention comparison/logging -//===----------------------------------------------------------------------===// - -typedef struct { - iree_allocator_t host_allocator; - iree_hal_dim_t b; - iree_hal_dim_t m; - iree_hal_dim_t k1; - iree_hal_dim_t k2; - iree_hal_dim_t n; - iree_hal_element_type_t query_elem_type; - iree_hal_element_type_t key_elem_type; - iree_hal_element_type_t value_elem_type; - iree_hal_element_type_t result_elem_type; - iree_byte_span_t query_contents; - iree_byte_span_t key_contents; - iree_byte_span_t value_contents; - iree_byte_span_t actual_contents; - iree_byte_span_t expected_contents; -} attention_results_t; - -static void attention_results_deinitialize(attention_results_t* results); - -static iree_status_t attention_results_initialize( - iree_hal_device_t* device, iree_hal_dim_t b_size, iree_hal_dim_t m_size, - iree_hal_dim_t k1_size, iree_hal_dim_t k2_size, iree_hal_dim_t n_size, - iree_hal_buffer_view_t* query, iree_hal_buffer_view_t* key, - iree_hal_buffer_view_t* value, iree_hal_buffer_view_t* result, - iree_allocator_t host_allocator, attention_results_t* out_results) { - IREE_TRACE_ZONE_BEGIN(z0); - - memset(out_results, 0, sizeof(*out_results)); - out_results->host_allocator = host_allocator; - - out_results->b = b_size; - out_results->m = m_size; - out_results->k1 = k1_size; - out_results->k2 = k2_size; - out_results->n = n_size; - - out_results->query_elem_type = iree_hal_buffer_view_element_type(query); - out_results->key_elem_type = iree_hal_buffer_view_element_type(key); - out_results->value_elem_type = iree_hal_buffer_view_element_type(value); - out_results->result_elem_type = iree_hal_buffer_view_element_type(result); - - iree_hal_buffer_t* query_buffer = iree_hal_buffer_view_buffer(query); - iree_hal_buffer_t* key_buffer = iree_hal_buffer_view_buffer(key); - iree_hal_buffer_t* value_buffer = iree_hal_buffer_view_buffer(value); - iree_hal_buffer_t* result_buffer = iree_hal_buffer_view_buffer(result); - - iree_status_t status = iree_ok_status(); - - if (iree_status_is_ok(status)) { - out_results->query_contents.data_length = - iree_hal_buffer_byte_length(query_buffer); - status = iree_allocator_malloc(host_allocator, - 
out_results->query_contents.data_length, - (void**)&out_results->query_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, query_buffer, 0, out_results->query_contents.data, - out_results->query_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - if (iree_status_is_ok(status)) { - out_results->key_contents.data_length = - iree_hal_buffer_byte_length(key_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->key_contents.data_length, - (void**)&out_results->key_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, key_buffer, 0, out_results->key_contents.data, - out_results->key_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - if (iree_status_is_ok(status)) { - out_results->value_contents.data_length = - iree_hal_buffer_byte_length(value_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->value_contents.data_length, - (void**)&out_results->value_contents.data); - } - - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, value_buffer, 0, out_results->value_contents.data, - out_results->value_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - if (iree_status_is_ok(status)) { - out_results->actual_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->actual_contents.data_length, - (void**)&out_results->actual_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, result_buffer, 0, out_results->actual_contents.data, - out_results->actual_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - if (iree_status_is_ok(status)) { - out_results->expected_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc( - host_allocator, out_results->expected_contents.data_length, - (void**)&out_results->expected_contents.data); - } - if (!iree_status_is_ok(status)) { - attention_results_deinitialize(out_results); - } - IREE_TRACE_ZONE_END(z0); - return status; -} - -static void attention_results_deinitialize(attention_results_t* results) { - IREE_TRACE_ZONE_BEGIN(z0); - iree_allocator_free(results->host_allocator, results->query_contents.data); - iree_allocator_free(results->host_allocator, results->key_contents.data); - iree_allocator_free(results->host_allocator, results->value_contents.data); - iree_allocator_free(results->host_allocator, results->actual_contents.data); - iree_allocator_free(results->host_allocator, results->expected_contents.data); - - IREE_TRACE_ZONE_END(z0); -} - -// Helper for check_attention_results: the actual interesting part once we've -// obtained and validated the {b,m,k1,k2,n}_size values. On error, detailed -// logging is written to |file| if it is not NULL. 
-static iree_status_t check_attention_results_impl( - FILE* file, const attention_results_t* results, int check_every) { - IREE_TRACE_ZONE_BEGIN(z0); - - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, reference_attention(results->b, results->m, results->k1, results->k2, - results->n, results->query_elem_type, - results->key_elem_type, results->value_elem_type, - results->query_contents, results->key_contents, - results->value_contents, results->actual_contents, - results->expected_contents, check_every)); - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -// Given an actual attention's inputs and output (all host-local), uses a -// reference attention implementation on the same inputs to check if the output -// is correct. On error, detailed logging is written to |file| if it is not -// NULL. -static iree_status_t check_attention_results( - FILE* file, const attention_results_t* results) { - IREE_TRACE_ZONE_BEGIN(z0); - // TODO: Increase the check every param to reduce the number of comparisons. - int check_every = 1; - iree_status_t status = - check_attention_results_impl(file, results, check_every); - if (!iree_status_is_ok(status) && check_every > 1) { - // If we got a failure with check_every>1, that didn't log a useful - // numerical summary, as most of the reference matrix entries hadn't been - // computed. Rerun now with check_every=1 to get that numerical logging. - iree_status_ignore(status); - status = check_attention_results_impl(file, results, 1); - } - IREE_TRACE_ZONE_END(z0); - return status; -} - -//===----------------------------------------------------------------------===// -// `attention_test` custom module -//===----------------------------------------------------------------------===// -// This uses the C++ wrapper to keep things simple. Though easier to use it's -// got additional overhead/code-size bloat that doesn't matter in a test like -// this. Making a C module builder API that removes the boilerplate there is TBD -// so this file is written in C besides this module so that we can swap it back -// to being pure C in the future. - -namespace iree { - -class AttentionTestModuleState final { - public: - explicit AttentionTestModuleState(iree_allocator_t host_allocator) - : host_allocator_(host_allocator) {} - ~AttentionTestModuleState() = default; - - // Fills the destination span with pseudorandom values of the given - // |element_type|. The given |seed| is passed to the pseudorandom generator. - // The pseudorandom values are reproducible both across runs and across - // machines. 
- StatusOr> GenerateRandom3dTensor( - const vm::ref device, int64_t dim0, int64_t dim1, - int64_t dim2, iree_hal_element_type_t element_type, int32_t seed) { - iree_hal_dim_t dims[3] = { - (iree_hal_dim_t)dim0, - (iree_hal_dim_t)dim1, - (iree_hal_dim_t)dim2, - }; - iree_hal_buffer_params_t buffer_params = {0}; - buffer_params.usage = IREE_HAL_BUFFER_USAGE_DEFAULT; - buffer_params.access = IREE_HAL_MEMORY_ACCESS_ALL; - buffer_params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE; - vm::ref result_view; - struct callback_state_t { - iree_hal_element_type_t element_type; - int32_t seed; - } callback_state = { - element_type, - seed, - }; - IREE_RETURN_IF_ERROR(iree_hal_buffer_view_generate_buffer( - device.get(), iree_hal_device_allocator(device.get()), - IREE_ARRAYSIZE(dims), dims, element_type, - IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params, - +[](iree_hal_buffer_mapping_t* mapping, void* user_data) { - callback_state_t callback_state = *(callback_state_t*)user_data; - iree_byte_span_t span = mapping->contents; - // Generate "uniform" integer-valued numbers in the range [min, max]. - int32_t min = 0; - int32_t max = 0; - iree_test_utils_get_min_max_for_element_type( - callback_state.element_type, &min, &max); - uint32_t range = (max - min + 1); - iree_host_size_t element_byte_count = - iree_hal_element_dense_byte_count(callback_state.element_type); - uint8_t* data_end = span.data + span.data_length; - uint32_t state = callback_state.seed; - for (uint8_t* data = span.data; data < data_end; - data += element_byte_count) { - int32_t value = - (int32_t)iree_test_utils_pseudorandom_range(&state, range) + - min; - iree_test_utils_write_element(callback_state.element_type, value, - data); - } - return iree_ok_status(); - }, - &callback_state, &result_view)); - return std::move(result_view); - } - - Status CheckAttentionResults( - const vm::ref device, int64_t b, int64_t m, int64_t k1, - int64_t k2, int64_t n, const vm::ref query, - const vm::ref key, - const vm::ref value, - const vm::ref actual_result) { - attention_results_t results = {}; - IREE_RETURN_IF_ERROR(attention_results_initialize( - device.get(), (iree_hal_dim_t)b, (iree_hal_dim_t)m, (iree_hal_dim_t)k1, - (iree_hal_dim_t)k2, (iree_hal_dim_t)n, query.get(), key.get(), - value.get(), actual_result.get(), host_allocator_, &results)); - iree_status_t status = check_attention_results(stderr, &results); - attention_results_deinitialize(&results); - return status; - } - - private: - iree_allocator_t host_allocator_; -}; - -static const vm::NativeFunction - kAttentionTestModuleFunctions[] = { - vm::MakeNativeFunction( - "generate_random_tensor", - &AttentionTestModuleState::GenerateRandom3dTensor), - vm::MakeNativeFunction( - "check_attention_results", - &AttentionTestModuleState::CheckAttentionResults), -}; - -struct AttentionTestModule final - : public vm::NativeModule { - using vm::NativeModule::NativeModule; - StatusOr> CreateState( - iree_allocator_t host_allocator) override { - return std::make_unique(host_allocator); - } -}; - -} // namespace iree - -static iree_status_t attention_test_module_create( - iree_vm_instance_t* instance, iree_allocator_t host_allocator, - iree_vm_module_t** out_module) { - IREE_ASSERT_ARGUMENT(out_module); - *out_module = NULL; - auto module = std::make_unique( - "attention_test", /*version=*/0, instance, host_allocator, - iree::span< - const iree::vm::NativeFunction>( - iree::kAttentionTestModuleFunctions)); - *out_module = module.release()->interface(); - return iree_ok_status(); -} - -int main(int 
argc, char** argv) { - IREE_TRACE_APP_ENTER(); - - iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv); - if (argc != 1) { - fprintf(stderr, "use --module= flags to specify the modules to run\n"); - IREE_TRACE_APP_EXIT(EXIT_FAILURE); - return EXIT_FAILURE; - } - - iree_status_t status = iree_test_utils_load_and_run_e2e_tests( - iree_allocator_system(), attention_test_module_create); - int exit_code = EXIT_SUCCESS; - if (!iree_status_is_ok(status)) { - iree_status_fprint(stderr, status); - bool is_unavailable = iree_status_is_unavailable(status); - iree_status_free(status); - exit_code = is_unavailable ? EXIT_SUCCESS : EXIT_FAILURE; - } - - IREE_TRACE_APP_EXIT(exit_code); - return exit_code; -} diff --git a/tools/testing/e2e/iree-e2e-conv2d-test.cc b/tools/testing/e2e/iree-e2e-conv2d-test.cc deleted file mode 100644 index c4158fdc73c9..000000000000 --- a/tools/testing/e2e/iree-e2e-conv2d-test.cc +++ /dev/null @@ -1,567 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include -#include -#include -#include - -#include "iree/base/api.h" -#include "iree/base/internal/flags.h" -#include "iree/base/internal/math.h" -#include "iree/hal/api.h" -#include "iree/modules/hal/module.h" -#include "iree/tooling/context_util.h" -#include "iree/tooling/device_util.h" -#include "iree/vm/api.h" -#include "iree/vm/native_module_cc.h" -#include "tools/testing/e2e/test_utils.h" - -//===----------------------------------------------------------------------===// -// Reference conv2d (NCHW-FCHW) -//===----------------------------------------------------------------------===// - -// Conversion from 4D indices in row major order to 1D index. -static int convert_to_1d_index(iree_hal_dim_t channels, iree_hal_dim_t height, - iree_hal_dim_t width, iree_hal_dim_t n, - iree_hal_dim_t c, iree_hal_dim_t h, - iree_hal_dim_t w) { - return n * (channels * height * width) + c * (height * width) + h * width + w; -} - -// [f16 <= f16 * f16 + f16] -static void reference_conv2d_f16_f16_f16_f16( - iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, - iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, - iree_hal_dim_t kw_size, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, - iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, iree_hal_dim_t oh_size, - iree_hal_dim_t ow_size, const uint16_t* input_data, - const uint16_t* kernel_data, const uint16_t* acc_data, - uint16_t* result_data, iree_hal_dim_t n, iree_hal_dim_t oc, - iree_hal_dim_t oh, iree_hal_dim_t ow) { - iree_hal_dim_t out_idx = - convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); - - float acc = acc_data ? 
iree_math_f16_to_f32(acc_data[out_idx]) : 0.f; - - for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { - for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { - for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { - iree_hal_dim_t inp_idx = convert_to_1d_index( - c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size), - (ow * sw_size + kw * dw_size)); - iree_hal_dim_t krnl_idx = - convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw); - - acc += iree_math_f16_to_f32(input_data[inp_idx]) * - iree_math_f16_to_f32(kernel_data[krnl_idx]); - } - } - result_data[out_idx] = iree_math_f32_to_f16(acc); - } -} - -static void reference_conv2d_f32_f32_f32_f32( - iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, - iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, - iree_hal_dim_t kw_size, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, - iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, iree_hal_dim_t oh_size, - iree_hal_dim_t ow_size, const float* input_data, const float* kernel_data, - const float* acc_data, float* result_data, iree_hal_dim_t n, - iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { - iree_hal_dim_t out_idx = - convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); - - float acc = acc_data ? acc_data[out_idx] : 0; - - for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { - for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { - for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { - iree_hal_dim_t inp_idx = convert_to_1d_index( - c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size), - (ow * sw_size + kw * dw_size)); - iree_hal_dim_t krnl_idx = - convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw); - - acc += input_data[inp_idx] * kernel_data[krnl_idx]; - } - } - result_data[out_idx] = acc; - } -} - -// Helper for reference_conv2d. -static iree_status_t reference_conv2d_element( - iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, - iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, - iree_hal_dim_t kw_size, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, - iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, iree_hal_dim_t oh_size, - iree_hal_dim_t ow_size, iree_hal_element_type_t input_type, - iree_hal_element_type_t kernel_type, iree_hal_element_type_t acc_type, - void* input_data, void* kernel_data, void* acc_data, void* result_data, - iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { - if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && - kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { - reference_conv2d_f32_f32_f32_f32( - n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, sh_size, - sw_size, dh_size, dw_size, oh_size, ow_size, (const float*)input_data, - (const float*)kernel_data, (const float*)acc_data, (float*)result_data, - n, oc, oh, ow); - } else if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16) { - reference_conv2d_f16_f16_f16_f16( - n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, sh_size, - sw_size, dh_size, dw_size, oh_size, ow_size, - (const uint16_t*)input_data, (const uint16_t*)kernel_data, - (const uint16_t*)acc_data, (uint16_t*)result_data, n, oc, oh, ow); - } else { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "unhandled combination of element types in conv2d"); - } - return iree_ok_status(); -} - -// Calculate the output shape given the dilation and strides. 
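The helper that follows computes the output spatial size from the input size, kernel size, stride, and dilation; equivalently, in Python (a sketch, with integer division standing in for floor):

```python
def conv_out_size(in_size: int, kernel: int, stride: int, dilation: int) -> int:
    # Effective kernel extent once dilation spreads the taps apart:
    # in - k - (k - 1)(d - 1) == in - ((k - 1) * d + 1).
    effective_kernel = (kernel - 1) * dilation + 1
    return (in_size - effective_kernel) // stride + 1

# e.g. a 3-tap kernel with dilation 2 spans 5 inputs:
assert conv_out_size(16, 3, stride=1, dilation=2) == 12
```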
-static iree_hal_dim_t out_shape_calc(iree_hal_dim_t i_shape, - iree_hal_dim_t k_shape, - iree_hal_dim_t stride, - iree_hal_dim_t dilation) { - iree_hal_dim_t x = (k_shape - 1) * (dilation - 1); - x = i_shape - k_shape - x; - return floor(x / stride) + 1; -} - -// Reference conv2d-NCHW-FCHW implementation, used to compare conv2d results -// against. -static iree_status_t reference_conv2d( - iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, - iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, - iree_hal_dim_t kw_size, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, - iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type, - iree_hal_element_type_t acc_type, iree_byte_span_t input_contents, - iree_byte_span_t kernel_contents, iree_byte_span_t acc_contents, - iree_byte_span_t result_contents, int compute_every) { - IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, n_size); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, c_size); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, h_size); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, w_size); - - iree_hal_dim_t oh_size = out_shape_calc(h_size, kh_size, sh_size, dh_size); - iree_hal_dim_t ow_size = out_shape_calc(w_size, kw_size, sw_size, dw_size); - - for (iree_hal_dim_t n = 0; n < n_size; ++n) { - for (iree_hal_dim_t oc = 0; oc < f_size; ++oc) { - for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) { - for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) { - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, reference_conv2d_element( - n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, - sh_size, sw_size, dh_size, dw_size, oh_size, ow_size, - input_type, kernel_type, acc_type, input_contents.data, - kernel_contents.data, acc_contents.data, - result_contents.data, n, oc, oh, ow)); - } - } - } - } - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -//===----------------------------------------------------------------------===// -// Conv2d comparison/logging -//===----------------------------------------------------------------------===// - -typedef struct { - iree_allocator_t host_allocator; - iree_hal_dim_t n; // batch dim - iree_hal_dim_t c; // input channels - iree_hal_dim_t h; // input height - iree_hal_dim_t w; // input width - iree_hal_dim_t f; // output channels - iree_hal_dim_t kh; // kernel height - iree_hal_dim_t kw; // kernel width - iree_hal_dim_t sh; // stride along height dim - iree_hal_dim_t sw; // stride along width dim - iree_hal_dim_t dh; // dilation along height dim - iree_hal_dim_t dw; // dilation along width dim - iree_hal_element_type_t input_type; - iree_hal_element_type_t kernel_type; - iree_hal_element_type_t acc_type; - iree_hal_element_type_t result_type; - iree_byte_span_t input_contents; - iree_byte_span_t kernel_contents; - iree_byte_span_t acc_contents; - iree_byte_span_t actual_contents; - iree_byte_span_t expected_contents; -} conv2d_results_t; - -static void conv2d_results_deinitialize(conv2d_results_t* results); - -static iree_status_t conv2d_results_initialize( - iree_hal_device_t* device, iree_hal_dim_t n_size, iree_hal_dim_t c_size, - iree_hal_dim_t h_size, iree_hal_dim_t w_size, iree_hal_dim_t f_size, - iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t sh_size, - iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_buffer_view_t* input, iree_hal_buffer_view_t* kernel, - iree_hal_buffer_view_t* acc, iree_hal_buffer_view_t* result, - iree_allocator_t host_allocator, 
conv2d_results_t* out_results) { - IREE_TRACE_ZONE_BEGIN(z0); - - memset(out_results, 0, sizeof(*out_results)); - out_results->host_allocator = host_allocator; - - out_results->n = n_size; - out_results->c = c_size; - out_results->h = h_size; - out_results->w = w_size; - out_results->f = f_size; - out_results->kh = kh_size; - out_results->kw = kw_size; - out_results->sh = sh_size; - out_results->sw = sw_size; - out_results->dh = dh_size; - out_results->dw = dw_size; - - out_results->input_type = iree_hal_buffer_view_element_type(input); - out_results->kernel_type = iree_hal_buffer_view_element_type(kernel); - out_results->acc_type = iree_hal_buffer_view_element_type(acc); - out_results->result_type = iree_hal_buffer_view_element_type(result); - - iree_hal_buffer_t* input_buffer = iree_hal_buffer_view_buffer(input); - iree_hal_buffer_t* kernel_buffer = iree_hal_buffer_view_buffer(kernel); - iree_hal_buffer_t* acc_buffer = acc ? iree_hal_buffer_view_buffer(acc) : NULL; - iree_hal_buffer_t* result_buffer = iree_hal_buffer_view_buffer(result); - - iree_status_t status = iree_ok_status(); - - if (iree_status_is_ok(status)) { - out_results->input_contents.data_length = - iree_hal_buffer_byte_length(input_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->input_contents.data_length, - (void**)&out_results->input_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, input_buffer, 0, out_results->input_contents.data, - out_results->input_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (iree_status_is_ok(status)) { - out_results->kernel_contents.data_length = - iree_hal_buffer_byte_length(kernel_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->kernel_contents.data_length, - (void**)&out_results->kernel_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, kernel_buffer, 0, out_results->kernel_contents.data, - out_results->kernel_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (acc_buffer) { - if (iree_status_is_ok(status)) { - out_results->acc_contents.data_length = - iree_hal_buffer_byte_length(acc_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->acc_contents.data_length, - (void**)&out_results->acc_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, acc_buffer, 0, out_results->acc_contents.data, - out_results->acc_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - } - - if (iree_status_is_ok(status)) { - out_results->actual_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->actual_contents.data_length, - (void**)&out_results->actual_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, result_buffer, 0, out_results->actual_contents.data, - out_results->actual_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (iree_status_is_ok(status)) { - out_results->expected_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc( - host_allocator, out_results->expected_contents.data_length, - (void**)&out_results->expected_contents.data); - } - - if (!iree_status_is_ok(status)) { - 
conv2d_results_deinitialize(out_results);
-  }
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
-static void conv2d_results_deinitialize(conv2d_results_t* results) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  iree_allocator_free(results->host_allocator, results->input_contents.data);
-  iree_allocator_free(results->host_allocator, results->kernel_contents.data);
-  if (!iree_byte_span_is_empty(results->acc_contents)) {
-    iree_allocator_free(results->host_allocator, results->acc_contents.data);
-  }
-  iree_allocator_free(results->host_allocator, results->actual_contents.data);
-  iree_allocator_free(results->host_allocator, results->expected_contents.data);
-
-  IREE_TRACE_ZONE_END(z0);
-}
-
-// Helper for check_conv2d: the actual interesting part once we've
-// obtained and validated the {n, f, oh, ow}_size values. On error, the first
-// index where the actual and expected values disagree is reported. TODO:
-// Add detailed logging to |file|.
-static iree_status_t check_conv2d_results_impl(FILE* file,
-                                               const conv2d_results_t* results,
-                                               int check_every) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, reference_conv2d(results->n, results->c, results->h, results->w,
-                           results->f, results->kh, results->kw, results->sh,
-                           results->sw, results->dh, results->dw,
-                           results->input_type, results->kernel_type,
-                           results->acc_type, results->input_contents,
-                           results->kernel_contents, results->acc_contents,
-                           results->expected_contents, check_every));
-
-  int count = 0;
-
-  iree_hal_dim_t oh_size =
-      out_shape_calc(results->h, results->kh, results->sh, results->dh);
-  iree_hal_dim_t ow_size =
-      out_shape_calc(results->w, results->kw, results->sw, results->dw);
-
-  for (iree_hal_dim_t n = 0; n < results->n; ++n) {
-    for (iree_hal_dim_t oc = 0; oc < results->f; ++oc) {
-      for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) {
-        for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) {
-          if (++count < check_every) continue;
-          count = 0;
-          iree_hal_dim_t idx =
-              convert_to_1d_index(results->f, oh_size, ow_size, n, oc, oh, ow);
-          iree_test_utils_e2e_value_t actual_value =
-              iree_test_utils_read_buffer_element(
-                  idx, results->result_type, results->actual_contents.data);
-          iree_test_utils_e2e_value_t expected_value =
-              iree_test_utils_read_buffer_element(
-                  idx, results->result_type, results->expected_contents.data);
-          if (!iree_test_utils_result_elements_agree(actual_value,
-                                                     expected_value)) {
-            fprintf(
-                file,
-                "\n\nerror: the actual and expected result tensors disagree "
-                "at n %" PRIdim ", oc %" PRIdim ", oh %" PRIdim ", ow %" PRIdim
-                ".\n\n",
-                n, oc, oh, ow);
-            IREE_TRACE_ZONE_END(z0);
-            return iree_make_status(IREE_STATUS_ABORTED);
-          }
-        }
-      }
-    }
-  }
-
-  IREE_TRACE_ZONE_END(z0);
-  return iree_ok_status();
-}
-
-// Given an actual conv2d's inputs and output (all host-local), uses a
-// reference conv2d implementation on the same inputs to check if the output
-// is correct. On error, the first index where the actual and expected values
-// disagree is reported. TODO: Add detailed logging to |file|.
-static iree_status_t check_conv2d_results(FILE* file,
-                                          const conv2d_results_t* results) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-  // TODO: Increase the check every param to reduce the number of comparisons.
-  int check_every = 1;
-  iree_status_t status = check_conv2d_results_impl(file, results, check_every);
-  if (!iree_status_is_ok(status) && check_every > 1) {
-    // If we got a failure with check_every>1, that didn't log a useful
-    // numerical summary, as most of the reference tensor entries hadn't been
-    // computed. Rerun now with check_every=1 to get that numerical logging.
-    iree_status_ignore(status);
-    status = check_conv2d_results_impl(file, results, 1);
-  }
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
-//===----------------------------------------------------------------------===//
-// `conv2d_test` custom module
-//===----------------------------------------------------------------------===//
-// This uses the C++ wrapper to keep things simple. Though easier to use, it
-// has additional overhead/code-size bloat that doesn't matter in a test like
-// this. Making a C module builder API that removes the boilerplate there is
-// TBD, so the rest of this file is written in C so that we can swap this
-// module back to being pure C in the future.
-
-namespace iree {
-
-class Conv2dTestModuleState final {
- public:
-  explicit Conv2dTestModuleState(iree_allocator_t host_allocator)
-      : host_allocator_(host_allocator) {}
-  ~Conv2dTestModuleState() = default;
-
-  // Fills the destination span with pseudorandom values of the given
-  // |element_type|. The given |seed| is passed to the pseudorandom generator.
-  // The pseudorandom values are reproducible both across runs and across
-  // machines.
-  StatusOr<vm::ref<iree_hal_buffer_view_t>> GenerateRandom4dTensor(
-      const vm::ref<iree_hal_device_t> device, int64_t dim0, int64_t dim1,
-      int64_t dim2, int64_t dim3, iree_hal_element_type_t element_type,
-      int32_t seed) {
-    iree_hal_dim_t dims[4] = {
-        (iree_hal_dim_t)dim0,
-        (iree_hal_dim_t)dim1,
-        (iree_hal_dim_t)dim2,
-        (iree_hal_dim_t)dim3,
-    };
-    iree_hal_buffer_params_t buffer_params = {0};
-    buffer_params.usage = IREE_HAL_BUFFER_USAGE_DEFAULT;
-    buffer_params.access = IREE_HAL_MEMORY_ACCESS_ALL;
-    buffer_params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE;
-    vm::ref<iree_hal_buffer_view_t> result_view;
-    struct callback_state_t {
-      iree_hal_element_type_t element_type;
-      int32_t seed;
-    } callback_state = {
-        element_type,
-        seed,
-    };
-    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_generate_buffer(
-        device.get(), iree_hal_device_allocator(device.get()),
-        IREE_ARRAYSIZE(dims), dims, element_type,
-        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-        +[](iree_hal_buffer_mapping_t* mapping, void* user_data) {
-          callback_state_t callback_state = *(callback_state_t*)user_data;
-          iree_byte_span_t span = mapping->contents;
-          // Generate "uniform" integer-valued numbers in the range [min,
-          // max].
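          // The generator is the minstd-style LCG defined in test_utils
          // (IREE_PRNG_MULTIPLIER / IREE_PRNG_MODULUS): each step computes
          //   state = (state * 48271) % 2147483647
          // so, e.g., seed 1 yields 48271, then 182605794, ... -- the same
          // sequence on every run and every machine.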
-          int32_t min = 0;
-          int32_t max = 0;
-          iree_test_utils_get_min_max_for_element_type(
-              callback_state.element_type, &min, &max);
-          uint32_t range = (max - min + 1);
-          iree_host_size_t element_byte_count =
-              iree_hal_element_dense_byte_count(callback_state.element_type);
-          uint8_t* data_end = span.data + span.data_length;
-          uint32_t state = callback_state.seed;
-          for (uint8_t* data = span.data; data < data_end;
-               data += element_byte_count) {
-            int32_t value =
-                (int32_t)iree_test_utils_pseudorandom_range(&state, range) +
-                min;
-            iree_test_utils_write_element(callback_state.element_type, value,
-                                          data);
-          }
-          return iree_ok_status();
-        },
-        &callback_state, &result_view));
-    return std::move(result_view);
-  }
-
-  Status CheckConv2dResults(
-      const vm::ref<iree_hal_device_t> device, int64_t n, int64_t c, int64_t h,
-      int64_t w, int64_t f, int64_t kh, int64_t kw, int64_t sh, int64_t sw,
-      int64_t dh, int64_t dw, const vm::ref<iree_hal_buffer_view_t> input,
-      const vm::ref<iree_hal_buffer_view_t> kernel,
-      const vm::ref<iree_hal_buffer_view_t> acc,
-      const vm::ref<iree_hal_buffer_view_t> actual_result) {
-    conv2d_results_t results = {};
-    IREE_RETURN_IF_ERROR(conv2d_results_initialize(
-        device.get(), (iree_hal_dim_t)n, (iree_hal_dim_t)c, (iree_hal_dim_t)h,
-        (iree_hal_dim_t)w, (iree_hal_dim_t)f, (iree_hal_dim_t)kh,
-        (iree_hal_dim_t)kw, (iree_hal_dim_t)sh, (iree_hal_dim_t)sw,
-        (iree_hal_dim_t)dh, (iree_hal_dim_t)dw, input.get(), kernel.get(),
-        acc.get(), actual_result.get(), host_allocator_, &results));
-    iree_status_t status = check_conv2d_results(stderr, &results);
-    conv2d_results_deinitialize(&results);
-    return status;
-  }
-
- private:
-  iree_allocator_t host_allocator_;
-};
-
-static const vm::NativeFunction<Conv2dTestModuleState>
-    kConv2dTestModuleFunctions[] = {
-        vm::MakeNativeFunction("generate_random_tensor",
-                               &Conv2dTestModuleState::GenerateRandom4dTensor),
-        vm::MakeNativeFunction("check_conv2d_results",
-                               &Conv2dTestModuleState::CheckConv2dResults),
-};
-
-struct Conv2dTestModule final
-    : public vm::NativeModule<Conv2dTestModuleState> {
-  using vm::NativeModule<Conv2dTestModuleState>::NativeModule;
-  StatusOr<std::unique_ptr<Conv2dTestModuleState>> CreateState(
-      iree_allocator_t host_allocator) override {
-    return std::make_unique<Conv2dTestModuleState>(host_allocator);
-  }
-};
-
-}  // namespace iree
-
-static iree_status_t conv2d_test_module_create(iree_vm_instance_t* instance,
-                                               iree_allocator_t host_allocator,
-                                               iree_vm_module_t** out_module) {
-  IREE_ASSERT_ARGUMENT(out_module);
-  *out_module = NULL;
-  auto module = std::make_unique<iree::Conv2dTestModule>(
-      "conv2d_test", /*version=*/0, instance, host_allocator,
-      iree::span<const iree::vm::NativeFunction<iree::Conv2dTestModuleState>>(
-          iree::kConv2dTestModuleFunctions));
-  *out_module = module.release()->interface();
-  return iree_ok_status();
-}
-
-int main(int argc, char** argv) {
-  IREE_TRACE_APP_ENTER();
-
-  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
-  if (argc != 1) {
-    fprintf(stderr, "use --module= flags to specify the modules to run\n");
-    IREE_TRACE_APP_EXIT(EXIT_FAILURE);
-    return EXIT_FAILURE;
-  }
-
-  // Run the tests. Note that some modules may be compiled for other platforms
-  // and not have the required architectures for execution within them - to
-  // keep the test runner dumber we gracefully fail those cases by returning
-  // success.
-  iree_status_t status = iree_test_utils_load_and_run_e2e_tests(
-      iree_allocator_system(), conv2d_test_module_create);
-  int exit_code = EXIT_SUCCESS;
-  if (!iree_status_is_ok(status)) {
-    iree_status_fprint(stderr, status);
-    bool is_device_unavailable = iree_status_is_not_found(status);
-    iree_status_free(status);
-    exit_code = is_device_unavailable ?
EXIT_SUCCESS : EXIT_FAILURE;
-  }
-
-  IREE_TRACE_APP_EXIT(exit_code);
-  return exit_code;
-}
diff --git a/tools/testing/e2e/iree-e2e-matmul-test.cc b/tools/testing/e2e/iree-e2e-matmul-test.cc
deleted file mode 100644
index f2773f048e79..000000000000
--- a/tools/testing/e2e/iree-e2e-matmul-test.cc
+++ /dev/null
@@ -1,743 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "iree/base/api.h"
-#include "iree/base/internal/cpu.h"
-#include "iree/base/internal/flags.h"
-#include "iree/base/internal/math.h"
-#include "iree/base/internal/path.h"
-#include "iree/hal/api.h"
-#include "iree/modules/hal/module.h"
-#include "iree/tooling/context_util.h"
-#include "iree/tooling/device_util.h"
-#include "iree/vm/api.h"
-#include "iree/vm/native_module_cc.h"
-#include "tools/testing/e2e/test_utils.h"
-
-//===----------------------------------------------------------------------===//
-// Reference matmul
-//===----------------------------------------------------------------------===//
-
-#define REFERENCE_MATMUL(LHSTYPE, RHSTYPE, RESTYPE, ACCTYPE)                   \
-  static void reference_matmul_##LHSTYPE##_##RHSTYPE##_##RESTYPE##_##ACCTYPE(  \
-      iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size,     \
-      iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type,      \
-      iree_hal_element_type_t acc_type, bool transpose_rhs,                    \
-      const LHSTYPE* lhs_data, const RHSTYPE* rhs_data,                        \
-      const ACCTYPE* acc_data, RESTYPE* result_data, iree_hal_dim_t m,         \
-      iree_hal_dim_t n) {                                                      \
-    ACCTYPE acc = acc_data ? acc_data[n + m * n_size] : 0;                     \
-    for (iree_hal_dim_t k = 0; k < k_size; ++k) {                              \
-      LHSTYPE lhs_value = lhs_data[k + m * k_size];                            \
-      RHSTYPE rhs_value =                                                      \
-          transpose_rhs ? rhs_data[k + n * k_size] : rhs_data[n + k * n_size]; \
-      acc += (ACCTYPE)lhs_value * (ACCTYPE)rhs_value;                          \
-    }                                                                          \
-    result_data[n + m * n_size] = acc;                                         \
-  }
-
-// Reference matmul instantiation from macro REFERENCE_MATMUL
-// for the f32 input, f32 accumulation, and f32 result.
-// [float <= float * float + float]
-REFERENCE_MATMUL(float, float, float, float)
-
-// Reference matmul instantiation from macro REFERENCE_MATMUL
-// for the int8_t input, int32_t accumulation, and int32_t result.
-// [i32 <= i8 * i8 + i32]
-REFERENCE_MATMUL(int8_t, int8_t, int32_t, int32_t)
-
-// Reference matmul instantiation from macro REFERENCE_MATMUL
-// for the int32_t input, int32_t accumulation, and int32_t result.
-// [i32 <= i32 * i32 + i32]
-REFERENCE_MATMUL(int32_t, int32_t, int32_t, int32_t)
-
-// Reference matmul for the f16 input, f16 accumulation, and f16 result.
-// [f16 <= f16 * f16 + f16]
-static void reference_matmul_f16_f16_f16_f16(
-    iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size,
-    iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type,
-    iree_hal_element_type_t acc_type, bool transpose_rhs,
-    const uint16_t* lhs_data, const uint16_t* rhs_data,
-    const uint16_t* acc_data, uint16_t* result_data, iree_hal_dim_t m,
-    iree_hal_dim_t n) {
-  float acc = acc_data ? iree_math_f16_to_f32(acc_data[n + m * n_size]) : 0.f;
-  for (iree_hal_dim_t k = 0; k < k_size; ++k) {
-    int64_t rhs_index = transpose_rhs ?
k + n * k_size : n + k * n_size;
-    acc += iree_math_f16_to_f32(lhs_data[k + m * k_size]) *
-           iree_math_f16_to_f32(rhs_data[rhs_index]);
-  }
-  result_data[n + m * n_size] = iree_math_f32_to_f16(acc);
-}
-
-// Reference matmul for the f16 input, f32 accumulation, and f32 result.
-// [f32 <= f16 * f16 + f32]
-static void reference_matmul_f16_f16_f32_f32(
-    iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size,
-    iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type,
-    iree_hal_element_type_t acc_type, bool transpose_rhs,
-    const uint16_t* lhs_data, const uint16_t* rhs_data, const float* acc_data,
-    float* result_data, iree_hal_dim_t m, iree_hal_dim_t n) {
-  float acc = acc_data ? acc_data[n + m * n_size] : 0.f;
-  for (iree_hal_dim_t k = 0; k < k_size; ++k) {
-    int64_t rhs_index = transpose_rhs ? k + n * k_size : n + k * n_size;
-    acc += iree_math_f16_to_f32(lhs_data[k + m * k_size]) *
-           iree_math_f16_to_f32(rhs_data[rhs_index]);
-  }
-  result_data[n + m * n_size] = acc;
-}
-
-// Reference matmul for the bf16 input, bf16 accumulation, and bf16 result.
-// [bf16 <= bf16 * bf16 + bf16]
-static void reference_matmul_bf16_bf16_bf16_bf16(
-    iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size,
-    iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type,
-    iree_hal_element_type_t acc_type, bool transpose_rhs,
-    const uint16_t* lhs_data, const uint16_t* rhs_data,
-    const uint16_t* acc_data, uint16_t* result_data, iree_hal_dim_t m,
-    iree_hal_dim_t n) {
-  float acc = acc_data ? iree_math_bf16_to_f32(acc_data[n + m * n_size]) : 0.f;
-  for (iree_hal_dim_t k = 0; k < k_size; ++k) {
-    int64_t rhs_index = transpose_rhs ? k + n * k_size : n + k * n_size;
-    acc += iree_math_bf16_to_f32(lhs_data[k + m * k_size]) *
-           iree_math_bf16_to_f32(rhs_data[rhs_index]);
-  }
-  result_data[n + m * n_size] = iree_math_f32_to_bf16(acc);
-}
-
-// Reference matmul for the bf16 input, f32 accumulation, and f32 result.
-// [f32 <= bf16 * bf16 + f32]
-static void reference_matmul_bf16_bf16_f32_f32(
-    iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size,
-    iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type,
-    iree_hal_element_type_t acc_type, bool transpose_rhs,
-    const uint16_t* lhs_data, const uint16_t* rhs_data, const float* acc_data,
-    float* result_data, iree_hal_dim_t m, iree_hal_dim_t n) {
-  float acc = acc_data ? acc_data[n + m * n_size] : 0.f;
-  for (iree_hal_dim_t k = 0; k < k_size; ++k) {
-    int64_t rhs_index = transpose_rhs ? k + n * k_size : n + k * n_size;
-    acc += iree_math_bf16_to_f32(lhs_data[k + m * k_size]) *
-           iree_math_bf16_to_f32(rhs_data[rhs_index]);
-  }
-  result_data[n + m * n_size] = acc;
-}
-
-// Helper for reference_matmul.
-// Computes one element in the result matrix.
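// All reference kernels above share the same row-major indexing convention:
// logical LHS element (m, k) lives at lhs_data[k + m * k_size] and logical
// RHS element (k, n) at rhs_data[n + k * n_size]; with transpose_rhs the RHS
// is stored (n, k)-major, so the same element is rhs_data[k + n * k_size].
// For example, with k_size=4 and n_size=8, RHS element (k=2, n=3) sits at
// index 3 + 2 * 8 = 19 untransposed and at 2 + 3 * 4 = 14 transposed.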
-static iree_status_t reference_matmul_element( - iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size, - iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type, - iree_hal_element_type_t acc_type, bool transpose_rhs, void* lhs_data, - void* rhs_data, void* acc_data, void* result_data, iree_hal_dim_t m, - iree_hal_dim_t n) { - if (lhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && - rhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { - reference_matmul_float_float_float_float( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const float*)lhs_data, (const float*)rhs_data, (const float*)acc_data, - (float*)result_data, m, n); - } else if (iree_hal_element_type_is_integer(lhs_type, 8) && - iree_hal_element_type_is_integer(rhs_type, 8) && - iree_hal_element_type_is_integer(acc_type, 32)) { - reference_matmul_int8_t_int8_t_int32_t_int32_t( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const int8_t*)lhs_data, (const int8_t*)rhs_data, - (const int32_t*)acc_data, (int32_t*)result_data, m, n); - } else if (iree_hal_element_type_is_integer(lhs_type, 32) && - iree_hal_element_type_is_integer(rhs_type, 32) && - iree_hal_element_type_is_integer(acc_type, 32)) { - reference_matmul_int32_t_int32_t_int32_t_int32_t( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const int32_t*)lhs_data, (const int32_t*)rhs_data, - (const int32_t*)acc_data, (int32_t*)result_data, m, n); - } else if (lhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - rhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16) { - reference_matmul_f16_f16_f16_f16( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const uint16_t*)lhs_data, (const uint16_t*)rhs_data, - (const uint16_t*)acc_data, (uint16_t*)result_data, m, n); - } else if (lhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - rhs_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { - reference_matmul_f16_f16_f32_f32( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const uint16_t*)lhs_data, (const uint16_t*)rhs_data, - (const float*)acc_data, (float*)result_data, m, n); - } else if (lhs_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16 && - rhs_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16 && - acc_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16) { - reference_matmul_bf16_bf16_bf16_bf16( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const uint16_t*)lhs_data, (const uint16_t*)rhs_data, - (const uint16_t*)acc_data, (uint16_t*)result_data, m, n); - } else if (lhs_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16 && - rhs_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16 && - acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { - reference_matmul_bf16_bf16_f32_f32( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, transpose_rhs, - (const uint16_t*)lhs_data, (const uint16_t*)rhs_data, - (const float*)acc_data, (float*)result_data, m, n); - } else { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "unhandled combination of element types in matmul"); - } - return iree_ok_status(); -} - -// Reference matmul implementation, used to compare matmul results against. 
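// As a concrete instance of that convention, a 2x2 f32 matmul with
//   lhs = [[1, 2], [3, 4]], rhs = [[5, 6], [7, 8]], and no accumulator
// yields result = [[1*5 + 2*7, 1*6 + 2*8], [3*5 + 4*7, 3*6 + 4*8]]
//               = [[19, 22], [43, 50]],
// stored row-major as result_data = {19, 22, 43, 50}. The compute_every
// parameter below strides the loop so that for large shapes only a subset of
// the expected elements is actually computed.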
-static iree_status_t reference_matmul( - iree_hal_dim_t m_size, iree_hal_dim_t k_size, iree_hal_dim_t n_size, - iree_hal_element_type_t lhs_type, iree_hal_element_type_t rhs_type, - iree_hal_element_type_t acc_type, bool transpose_rhs, - iree_byte_span_t lhs_contents, iree_byte_span_t rhs_contents, - iree_byte_span_t acc_contents, iree_byte_span_t result_contents, - int compute_every) { - IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, m_size); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, k_size); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, n_size); - - iree_host_size_t count = 0; - for (iree_hal_dim_t m = 0; m < m_size; ++m) { - for (iree_hal_dim_t n = 0; n < n_size; ++n) { - if (++count < compute_every) continue; - count = 0; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, reference_matmul_element( - m_size, k_size, n_size, lhs_type, rhs_type, acc_type, - transpose_rhs, lhs_contents.data, rhs_contents.data, - acc_contents.data, result_contents.data, m, n)); - } - } - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -//===----------------------------------------------------------------------===// -// Matmul comparison/logging -//===----------------------------------------------------------------------===// - -typedef struct { - iree_allocator_t host_allocator; - iree_hal_dim_t m; - iree_hal_dim_t k; - iree_hal_dim_t n; - iree_hal_element_type_t lhs_type; - iree_hal_element_type_t rhs_type; - iree_hal_element_type_t acc_type; - iree_hal_element_type_t result_type; - bool transpose_rhs; - iree_byte_span_t lhs_contents; - iree_byte_span_t rhs_contents; - iree_byte_span_t acc_contents; - iree_byte_span_t actual_contents; - iree_byte_span_t expected_contents; -} matmul_results_t; - -static void matmul_results_deinitialize(matmul_results_t* results); - -static iree_status_t matmul_results_initialize( - iree_hal_device_t* device, iree_hal_dim_t m_size, iree_hal_dim_t k_size, - iree_hal_dim_t n_size, uint32_t transpose_rhs, iree_hal_buffer_view_t* lhs, - iree_hal_buffer_view_t* rhs, iree_hal_buffer_view_t* acc, - iree_hal_buffer_view_t* result, iree_allocator_t host_allocator, - matmul_results_t* out_results) { - IREE_TRACE_ZONE_BEGIN(z0); - - memset(out_results, 0, sizeof(*out_results)); - out_results->host_allocator = host_allocator; - - out_results->m = m_size; - out_results->k = k_size; - out_results->n = n_size; - - out_results->lhs_type = iree_hal_buffer_view_element_type(lhs); - out_results->rhs_type = iree_hal_buffer_view_element_type(rhs); - out_results->acc_type = iree_hal_buffer_view_element_type(result); - out_results->result_type = iree_hal_buffer_view_element_type(result); - - out_results->transpose_rhs = transpose_rhs != 0; - - iree_hal_buffer_t* lhs_buffer = iree_hal_buffer_view_buffer(lhs); - iree_hal_buffer_t* rhs_buffer = iree_hal_buffer_view_buffer(rhs); - iree_hal_buffer_t* acc_buffer = acc ? 
iree_hal_buffer_view_buffer(acc) : NULL; - iree_hal_buffer_t* result_buffer = iree_hal_buffer_view_buffer(result); - - iree_status_t status = iree_ok_status(); - - if (iree_status_is_ok(status)) { - out_results->lhs_contents.data_length = - iree_hal_buffer_byte_length(lhs_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->lhs_contents.data_length, - (void**)&out_results->lhs_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, lhs_buffer, 0, out_results->lhs_contents.data, - out_results->lhs_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (iree_status_is_ok(status)) { - out_results->rhs_contents.data_length = - iree_hal_buffer_byte_length(rhs_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->rhs_contents.data_length, - (void**)&out_results->rhs_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, rhs_buffer, 0, out_results->rhs_contents.data, - out_results->rhs_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (acc_buffer) { - if (iree_status_is_ok(status)) { - out_results->acc_contents.data_length = - iree_hal_buffer_byte_length(acc_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->acc_contents.data_length, - (void**)&out_results->acc_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, acc_buffer, 0, out_results->acc_contents.data, - out_results->acc_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - } - - if (iree_status_is_ok(status)) { - out_results->actual_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc(host_allocator, - out_results->actual_contents.data_length, - (void**)&out_results->actual_contents.data); - } - if (iree_status_is_ok(status)) { - status = iree_hal_device_transfer_d2h( - device, result_buffer, 0, out_results->actual_contents.data, - out_results->actual_contents.data_length, - IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); - } - - if (iree_status_is_ok(status)) { - out_results->expected_contents.data_length = - iree_hal_buffer_byte_length(result_buffer); - status = iree_allocator_malloc( - host_allocator, out_results->expected_contents.data_length, - (void**)&out_results->expected_contents.data); - } - - if (!iree_status_is_ok(status)) { - matmul_results_deinitialize(out_results); - } - IREE_TRACE_ZONE_END(z0); - return status; -} - -static void matmul_results_deinitialize(matmul_results_t* results) { - IREE_TRACE_ZONE_BEGIN(z0); - - iree_allocator_free(results->host_allocator, results->lhs_contents.data); - iree_allocator_free(results->host_allocator, results->rhs_contents.data); - if (!iree_byte_span_is_empty(results->acc_contents)) { - iree_allocator_free(results->host_allocator, results->acc_contents.data); - } - iree_allocator_free(results->host_allocator, results->actual_contents.data); - iree_allocator_free(results->host_allocator, results->expected_contents.data); - - IREE_TRACE_ZONE_END(z0); -} - -// Returns the largest number of characters to print any matrix element. 
-static int get_max_elem_width(precision_t precision, iree_hal_dim_t rows, - iree_hal_dim_t row_start, iree_hal_dim_t row_end, - iree_hal_dim_t cols, iree_hal_dim_t col_start, - iree_hal_dim_t col_end, - iree_hal_element_type_t element_type, - const uint8_t* matrix) { - int max_elem_width = 0; - for (int row = row_start; row < row_end; row++) { - for (int col = col_start; col < col_end; col++) { - iree_hal_dim_t idx = col + row * cols; - iree_test_utils_e2e_value_t elem = - iree_test_utils_read_buffer_element(idx, element_type, matrix); - // NOTE: iree_max is a macro and may evaluate its args twice. - char buf[64]; - int this_elem_width = - iree_test_utils_snprintf_value(buf, sizeof(buf), elem, precision); - max_elem_width = iree_max(max_elem_width, this_elem_width); - } - } - return max_elem_width; -} - -// Prints |matrix| to |file|, with |label| as caption. -// |precision| controls how many decimals are printed for float values. -// -// If |other_matrix| is not NULL, then any matrix entries that disagree -// between |matrix| and |other_matrix| (according to -// matmul_result_elements_agree) are highlighted. -// -// |highlight| is either NULL or is a UTF-8 string that will be printed next to -// any entry of |matrix| that disagrees with the corresponding entry of -// |other_matrix|. -// -// |highlight| should be NULL if and only if |other_matrix| is NULL. -// -// In order for matrix columns to be properly laid out, the rendering of -// |highlight| in a fixed-width font should have the width of two regular Latin -// characters. According to -// https://www.unicode.org/reports/tr11/#Recommendations, a single emoji -// character should meet that requirement. -static void print_matrix(FILE* file, const char* label, precision_t precision, - iree_hal_dim_t rows, iree_hal_dim_t row_start, - iree_hal_dim_t row_end, iree_hal_dim_t cols, - iree_hal_dim_t col_start, iree_hal_dim_t col_end, - iree_hal_element_type_t element_type, - const uint8_t* matrix, const uint8_t* other_matrix, - const char* highlight) { - IREE_ASSERT((other_matrix == NULL) == (highlight == NULL)); - int max_elem_width = - get_max_elem_width(precision, rows, row_start, row_end, cols, col_start, - col_end, element_type, matrix); - if (other_matrix) { - // NOTE: iree_max is a macro and may evaluate its args twice. - int other_matrix_max_elem_width = - get_max_elem_width(precision, rows, row_start, row_end, cols, col_start, - col_end, element_type, other_matrix); - max_elem_width = iree_max(max_elem_width, other_matrix_max_elem_width); - } - - fprintf(file, - "%s (rows %" PRIdsz "..%" PRIdsz " out of 0..%" PRIdsz - ", columns %" PRIdsz "..%" PRIdsz " out of 0..%" PRIdsz ")\n", - label, row_start, row_end - 1, rows - 1, col_start, col_end - 1, - cols - 1); - for (int row = row_start; row < row_end; row++) { - for (int col = col_start; col < col_end; col++) { - iree_hal_dim_t idx = col + row * cols; - iree_test_utils_e2e_value_t element = - iree_test_utils_read_buffer_element(idx, element_type, matrix); - bool disagree = false; - if (other_matrix) { - iree_test_utils_e2e_value_t other_element = - iree_test_utils_read_buffer_element(idx, element_type, - other_matrix); - disagree = - !iree_test_utils_result_elements_agree(element, other_element); - } - char buf[64]; - iree_test_utils_snprintf_value(buf, sizeof(buf), element, precision); - fprintf(file, "%*s", max_elem_width, buf); - // See comment on |highlight| function parameter for why 2 spaces. 
- // A 3rd space is added unconditionally to make it clear that a highlight - // concerns the matrix entry to its left. - fprintf(file, "%s ", disagree ? highlight : " "); - } - fprintf(file, "\n"); - } -} - -// Helper for check_matmul_results: handler for the failure case. -// If |file| is not NULL, detailed logging is written to it. -static iree_status_t check_matmul_failure( - FILE* file, const matmul_results_t* results, - iree_test_utils_e2e_value_t actual_value, - iree_test_utils_e2e_value_t expected_value, iree_hal_dim_t row, - iree_hal_dim_t col, int check_every) { - if (!file || check_every > 1) { - // No logging of errors with check_every>1 as most of the reference matrix - // elements have not been computed. The caller is expected to retry with - // check_every=1. - return iree_make_status(IREE_STATUS_ABORTED); - } - - IREE_TRACE_ZONE_BEGIN(z0); - - fprintf(file, - "\n\nerror: the actual and expected result matrices disagree " - "at row %" PRIdim ", column %" PRIdim ".\n\n", - row, col); - char actual_value_buf[32]; - char expected_value_buf[32]; - iree_test_utils_snprintf_value(actual_value_buf, sizeof(actual_value_buf), - actual_value, PRECISION_HIGH); - iree_test_utils_snprintf_value(expected_value_buf, sizeof(expected_value_buf), - expected_value, PRECISION_HIGH); - fprintf(file, "actual value: %s\n", actual_value_buf); - fprintf(file, "expected value: %s\n", expected_value_buf); - - iree_hal_dim_t context = 8; - const char* context_env = getenv("IREE_MATMUL_TEST_SHOW_CONTEXT"); - if (context_env) { - if (1 != sscanf(context_env, "%" PRIdim, &context)) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "failed to parse IREE_MATMUL_TEST_SHOW_CONTEXT " - "as \"%%" PRIdim "\"; got \"%s\"", - context_env); - } - } - iree_hal_dim_t m_start = - (iree_hal_dim_t)iree_max(0, (int64_t)row - (int64_t)context); - iree_hal_dim_t m_end = iree_min(results->m, row + context); - iree_hal_dim_t n_start = - (iree_hal_dim_t)iree_max(0, (int64_t)col - (int64_t)context); - iree_hal_dim_t n_end = iree_min(results->n, col + context); - iree_hal_dim_t k_start = 0; - iree_hal_dim_t k_end = iree_min(results->k, 2 * context); - // [k_start, k_end) could be arbitrarily long at this point. Constrain it a - // bit to avoid huge output. 
- k_end = iree_min(k_end, k_start + 4 * context); - - fprintf(file, "\n"); - print_matrix(file, "left-hand side", PRECISION_LOW, results->m, m_start, - m_end, results->k, k_start, k_end, results->lhs_type, - results->lhs_contents.data, NULL, NULL); - fprintf(file, "\n"); - print_matrix(file, "right-hand side", PRECISION_LOW, results->k, k_start, - k_end, results->n, n_start, n_end, results->rhs_type, - results->rhs_contents.data, NULL, NULL); - fprintf(file, "\n"); - if (results->acc_contents.data) { - print_matrix(file, "input accumulator", PRECISION_LOW, results->m, m_start, - m_end, results->n, n_start, n_end, results->acc_type, - results->acc_contents.data, NULL, NULL); - fprintf(file, "\n"); - } - print_matrix(file, "expected result", PRECISION_LOW, results->m, m_start, - m_end, results->n, n_start, n_end, results->result_type, - results->expected_contents.data, results->actual_contents.data, - iree_test_utils_emoji(true)); - fprintf(file, "\n"); - print_matrix(file, "actual result", PRECISION_LOW, results->m, m_start, m_end, - results->n, n_start, n_end, results->result_type, - results->actual_contents.data, results->expected_contents.data, - iree_test_utils_emoji(false)); - fprintf(file, "\n"); - - IREE_TRACE_ZONE_END(z0); - return iree_make_status(IREE_STATUS_ABORTED); -} - -// Helper for check_matmul_results: the actual interesting part once we've -// obtained and validated the {m,k,n}_size values. On error, detailed logging is -// written to |file| if it is not NULL. -static iree_status_t check_matmul_results_impl(FILE* file, - const matmul_results_t* results, - int check_every) { - IREE_TRACE_ZONE_BEGIN(z0); - - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, reference_matmul( - results->m, results->k, results->n, results->lhs_type, - results->rhs_type, results->acc_type, results->transpose_rhs, - results->lhs_contents, results->rhs_contents, - results->acc_contents, results->expected_contents, check_every)); - - int count = 0; - for (iree_hal_dim_t m = 0; m < results->m; ++m) { - for (iree_hal_dim_t n = 0; n < results->n; ++n) { - if (++count < check_every) continue; - count = 0; - iree_hal_dim_t idx = m * results->n + n; - iree_test_utils_e2e_value_t actual_value = - iree_test_utils_read_buffer_element(idx, results->result_type, - results->actual_contents.data); - iree_test_utils_e2e_value_t expected_value = - iree_test_utils_read_buffer_element(idx, results->result_type, - results->expected_contents.data); - if (!iree_test_utils_result_elements_agree(actual_value, - expected_value)) { - iree_status_t status = check_matmul_failure( - file, results, actual_value, expected_value, m, n, check_every); - IREE_TRACE_ZONE_END(z0); - return status; - } - } - } - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -// Given an actual matmul's inputs and output (all host-local), uses a reference -// matmul implementation on the same inputs to check if the output is correct. -// On error, detailed logging is written to |file| if it is not NULL. -static iree_status_t check_matmul_results(FILE* file, - const matmul_results_t* results) { - IREE_TRACE_ZONE_BEGIN(z0); - int check_every = iree_test_utils_calculate_check_every( - results->m * results->n, results->n); - iree_status_t status = check_matmul_results_impl(file, results, check_every); - if (!iree_status_is_ok(status) && check_every > 1) { - // If we got a failure with check_every>1, that didn't log a useful - // numerical summary, as most of the reference matrix entries hadn't been - // computed. 
Rerun now with check_every=1 to get that numerical logging.
-    iree_status_ignore(status);
-    status = check_matmul_results_impl(file, results, 1);
-  }
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
-//===----------------------------------------------------------------------===//
-// `matmul_test` custom module
-//===----------------------------------------------------------------------===//
-// This uses the C++ wrapper to keep things simple. Though easier to use, it
-// has additional overhead/code-size bloat that doesn't matter in a test like
-// this. Making a C module builder API that removes the boilerplate there is
-// TBD, so the rest of this file is written in C so that we can swap this
-// module back to being pure C in the future.
-
-namespace iree {
-
-class MatmulTestModuleState final {
- public:
-  explicit MatmulTestModuleState(iree_allocator_t host_allocator)
-      : host_allocator_(host_allocator) {}
-  ~MatmulTestModuleState() = default;
-
-  // Fills the destination span with pseudorandom values of the given
-  // |element_type|. The given |seed| is passed to the pseudorandom generator.
-  // The pseudorandom values are reproducible both across runs and across
-  // machines.
-  StatusOr<vm::ref<iree_hal_buffer_view_t>> GenerateRandomMatrix(
-      const vm::ref<iree_hal_device_t> device, int64_t dim0, int64_t dim1,
-      iree_hal_element_type_t element_type, int32_t seed) {
-    iree_hal_dim_t dims[2] = {
-        (iree_hal_dim_t)dim0,
-        (iree_hal_dim_t)dim1,
-    };
-    iree_hal_buffer_params_t buffer_params = {0};
-    buffer_params.usage = IREE_HAL_BUFFER_USAGE_DEFAULT;
-    buffer_params.access = IREE_HAL_MEMORY_ACCESS_ALL;
-    buffer_params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE;
-    vm::ref<iree_hal_buffer_view_t> result_view;
-    struct callback_state_t {
-      iree_hal_element_type_t element_type;
-      int32_t seed;
-    } callback_state = {
-        element_type,
-        seed,
-    };
-    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_generate_buffer(
-        device.get(), iree_hal_device_allocator(device.get()),
-        IREE_ARRAYSIZE(dims), dims, element_type,
-        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-        +[](iree_hal_buffer_mapping_t* mapping, void* user_data) {
-          callback_state_t callback_state = *(callback_state_t*)user_data;
-          iree_byte_span_t span = mapping->contents;
-          // Generate "uniform" integer-valued numbers in the range [min, max].
-          int32_t min = 0;
-          int32_t max = 0;
-          iree_test_utils_get_min_max_for_element_type(
-              callback_state.element_type, &min, &max);
-          uint32_t range = (max - min + 1);
-          iree_host_size_t element_byte_count =
-              iree_hal_element_dense_byte_count(callback_state.element_type);
-          uint8_t* data_end = span.data + span.data_length;
-          uint32_t state = callback_state.seed;
-          for (uint8_t* data = span.data; data < data_end;
-               data += element_byte_count) {
-            int32_t value =
-                (int32_t)iree_test_utils_pseudorandom_range(&state, range) +
-                min;
-            iree_test_utils_write_element(callback_state.element_type, value,
-                                          data);
-          }
-          return iree_ok_status();
-        },
-        &callback_state, &result_view));
-    return std::move(result_view);
-  }
-
-  Status CheckMatmulResults(
-      const vm::ref<iree_hal_device_t> device, int64_t m, int64_t k, int64_t n,
-      int32_t transpose_rhs, const vm::ref<iree_hal_buffer_view_t> lhs,
-      const vm::ref<iree_hal_buffer_view_t> rhs,
-      const vm::ref<iree_hal_buffer_view_t> acc,
-      const vm::ref<iree_hal_buffer_view_t> actual_result) {
-    matmul_results_t results = {};
-    IREE_RETURN_IF_ERROR(matmul_results_initialize(
-        device.get(), (iree_hal_dim_t)m, (iree_hal_dim_t)k, (iree_hal_dim_t)n,
-        transpose_rhs, lhs.get(), rhs.get(), acc.get(), actual_result.get(),
-        host_allocator_, &results));
-    iree_status_t status = check_matmul_results(stderr, &results);
-    matmul_results_deinitialize(&results);
-    return status;
-  }
-
- private:
-  iree_allocator_t host_allocator_;
-};
-
-static const vm::NativeFunction<MatmulTestModuleState>
-    kMatmulTestModuleFunctions[] = {
-        vm::MakeNativeFunction("generate_random_matrix",
-                               &MatmulTestModuleState::GenerateRandomMatrix),
-        vm::MakeNativeFunction("check_matmul_results",
-                               &MatmulTestModuleState::CheckMatmulResults),
-};
-
-struct MatmulTestModule final
-    : public vm::NativeModule<MatmulTestModuleState> {
-  using vm::NativeModule<MatmulTestModuleState>::NativeModule;
-  StatusOr<std::unique_ptr<MatmulTestModuleState>> CreateState(
-      iree_allocator_t host_allocator) override {
-    return std::make_unique<MatmulTestModuleState>(host_allocator);
-  }
-};
-
-}  // namespace iree
-
-static iree_status_t matmul_test_module_create(iree_vm_instance_t* instance,
-                                               iree_allocator_t host_allocator,
-                                               iree_vm_module_t** out_module) {
-  IREE_ASSERT_ARGUMENT(out_module);
-  *out_module = NULL;
-  auto module = std::make_unique<iree::MatmulTestModule>(
-      "matmul_test", /*version=*/0, instance, host_allocator,
-      iree::span<const iree::vm::NativeFunction<iree::MatmulTestModuleState>>(
-          iree::kMatmulTestModuleFunctions));
-  *out_module = module.release()->interface();
-  return iree_ok_status();
-}
-
-int main(int argc, char** argv) {
-  IREE_TRACE_APP_ENTER();
-
-  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
-  if (argc != 1) {
-    fprintf(stderr, "use --module= flags to specify the modules to run\n");
-    IREE_TRACE_APP_EXIT(EXIT_FAILURE);
-    return EXIT_FAILURE;
-  }
-
-  // Run the tests. Note that some modules may be compiled for other platforms
-  // and not have the required architectures for execution within them - to
-  // keep the test runner dumber we gracefully fail those cases by returning
-  // success.
-  iree_status_t status = iree_test_utils_load_and_run_e2e_tests(
-      iree_allocator_system(), matmul_test_module_create);
-  int exit_code = EXIT_SUCCESS;
-  if (!iree_status_is_ok(status)) {
-    iree_status_fprint(stderr, status);
-    bool is_device_unavailable = iree_status_is_not_found(status);
-    iree_status_free(status);
-    exit_code = is_device_unavailable ?
EXIT_SUCCESS : EXIT_FAILURE;
-  }
-
-  IREE_TRACE_APP_EXIT(exit_code);
-  return exit_code;
-}
diff --git a/tools/testing/e2e/test_utils.c b/tools/testing/e2e/test_utils.c
deleted file mode 100644
index 29811482de5a..000000000000
--- a/tools/testing/e2e/test_utils.c
+++ /dev/null
@@ -1,494 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "tools/testing/e2e/test_utils.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "iree/base/api.h"
-#include "iree/base/internal/cpu.h"
-#include "iree/base/internal/flags.h"
-#include "iree/base/internal/math.h"
-#include "iree/base/internal/path.h"
-#include "iree/hal/api.h"
-#include "iree/modules/hal/module.h"
-#include "iree/tooling/context_util.h"
-#include "iree/tooling/device_util.h"
-#include "iree/vm/api.h"
-
-IREE_FLAG(bool, require_exact_results, true,
-          "Requires floating point result elements to match exactly.");
-
-bool iree_test_utils_require_exact_results(void) {
-  return FLAG_require_exact_results;
-}
-
-IREE_FLAG(
-    float, acceptable_fp_delta, 1e-5f,
-    "Maximum absolute difference allowed with inexact floating point results.");
-
-float iree_test_utils_acceptable_fp_delta(void) {
-  return FLAG_acceptable_fp_delta;
-}
-
-IREE_FLAG(
-    int32_t, max_elements_to_check, 10000,
-    "Maximum number of tensor elements to check for the given test. For larger "
-    "buffers, only every n-th element will be checked for some n chosen to "
-    "stay just under that threshold and to avoid being a divisor of the inner "
-    "dimension size to avoid special patterns. As the check uses a slow "
-    "reference implementation, this is a trade-off between test latency and "
-    "coverage. The value 0 means check all elements.");
-
-int32_t iree_test_utils_max_elements_to_check(void) {
-  return FLAG_max_elements_to_check;
-}
-
-const char* iree_test_utils_emoji(bool good) { return good ?
"🦄" : "🐞"; } - -int iree_test_utils_calculate_check_every(iree_hal_dim_t tot_elements, - iree_hal_dim_t no_div_of) { - int check_every = 1; - if (iree_test_utils_max_elements_to_check()) { - check_every = - ((tot_elements) + iree_test_utils_max_elements_to_check() - 1) / - iree_test_utils_max_elements_to_check(); - if (check_every < 1) check_every = 1; - if (check_every > 1) - while ((no_div_of % check_every) == 0) ++check_every; - } - return check_every; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_none() { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_NONE; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_i8(int8_t value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_I8; - result.i8 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_i16(int16_t value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_I16; - result.i16 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_i32(int32_t value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_I32; - result.i32 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_f16(uint16_t value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_F16; - result.f16_u16 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_bf16(uint16_t value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_BF16; - result.bf16_u16 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_value_make_f32(float value) { - iree_test_utils_e2e_value_t result; - result.type = IREE_TEST_UTILS_VALUE_TYPE_F32; - result.f32 = value; - return result; -} - -iree_test_utils_e2e_value_t iree_test_utils_read_buffer_element( - iree_hal_dim_t index, iree_hal_element_type_t result_type, - const void* data) { - if (iree_hal_element_type_is_integer(result_type, 8)) { - return iree_test_utils_value_make_i8(((int8_t*)data)[index]); - } else if (iree_hal_element_type_is_integer(result_type, 16)) { - return iree_test_utils_value_make_i16(((int16_t*)data)[index]); - } else if (iree_hal_element_type_is_integer(result_type, 32)) { - return iree_test_utils_value_make_i32(((int32_t*)data)[index]); - } else if (result_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16) { - return iree_test_utils_value_make_f16(((uint16_t*)data)[index]); - } else if (result_type == IREE_HAL_ELEMENT_TYPE_BFLOAT_16) { - return iree_test_utils_value_make_bf16(((uint16_t*)data)[index]); - } else if (result_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { - return iree_test_utils_value_make_f32(((float*)data)[index]); - } - iree_status_abort(iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "unhandled matmul result type")); - return iree_test_utils_value_make_none(); -} - -int iree_test_utils_snprintf_value(char* buf, size_t bufsize, - iree_test_utils_e2e_value_t value, - precision_t precision) { - switch (value.type) { - case IREE_TEST_UTILS_VALUE_TYPE_I8: - return snprintf(buf, bufsize, "%" PRIi8, value.i8); - case IREE_TEST_UTILS_VALUE_TYPE_I16: - return snprintf(buf, bufsize, "%" PRIi16, value.i16); - case IREE_TEST_UTILS_VALUE_TYPE_I32: - return snprintf(buf, bufsize, "%" PRIi32, value.i32); - case IREE_TEST_UTILS_VALUE_TYPE_I64: - return snprintf(buf, bufsize, "%" PRIi64, value.i64); - case IREE_TEST_UTILS_VALUE_TYPE_F16: - return snprintf(buf, 
bufsize,
-                      precision == PRECISION_HIGH ? "%.5g" : "%.4g",
-                      iree_math_f16_to_f32(value.f16_u16));
-    case IREE_TEST_UTILS_VALUE_TYPE_BF16:
-      return snprintf(buf, bufsize,
-                      precision == PRECISION_HIGH ? "%.5g" : "%.4g",
-                      iree_math_bf16_to_f32(value.bf16_u16));
-    case IREE_TEST_UTILS_VALUE_TYPE_F32:
-      return snprintf(buf, bufsize,
-                      precision == PRECISION_HIGH ? "%.8g" : "%.4g", value.f32);
-    case IREE_TEST_UTILS_VALUE_TYPE_F64:
-      return snprintf(buf, bufsize,
-                      precision == PRECISION_HIGH ? "%.16g" : "%.4g",
-                      value.f64);
-    default:
-      iree_status_abort(iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                         "unhandled value type"));
-      return 0;
-  }
-}
-
-bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected,
-                                           iree_test_utils_e2e_value_t actual) {
-  float acceptable_fp_delta = iree_test_utils_acceptable_fp_delta();
-  if (expected.type != actual.type) {
-    iree_status_abort(
-        iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "mismatched types"));
-    return false;
-  }
-
-  if (acceptable_fp_delta < 0.0f) {
-    iree_status_abort(iree_make_status(
-        IREE_STATUS_INVALID_ARGUMENT,
-        "negative tolerance (acceptable_fp_delta=%.8g)", acceptable_fp_delta));
-    return false;
-  }
-
-  switch (expected.type) {
-    case IREE_TEST_UTILS_VALUE_TYPE_I32:
-      return actual.i32 == expected.i32;
-    // Since we fill buffers with small integers for functional testing of
-    // floating point GEMMs, we can test for bit-exactness on the actual and
-    // expected values. Inexact results are only permitted when the
-    // `require_exact_results` flag is set to `false`.
-    case IREE_TEST_UTILS_VALUE_TYPE_F16:
-      if (actual.f16_u16 == expected.f16_u16) return true;
-      if (iree_test_utils_require_exact_results()) return false;
-      return fabsf(iree_math_f16_to_f32(actual.f16_u16) -
-                   iree_math_f16_to_f32(expected.f16_u16)) <
-             acceptable_fp_delta;
-    case IREE_TEST_UTILS_VALUE_TYPE_BF16:
-      if (actual.bf16_u16 == expected.bf16_u16) return true;
-      if (iree_test_utils_require_exact_results()) return false;
-      return fabsf(iree_math_bf16_to_f32(actual.bf16_u16) -
-                   iree_math_bf16_to_f32(expected.bf16_u16)) <
-             acceptable_fp_delta;
-    case IREE_TEST_UTILS_VALUE_TYPE_F32:
-      if (actual.f32 == expected.f32) return true;
-      if (iree_test_utils_require_exact_results()) return false;
-      return fabsf(actual.f32 - expected.f32) < acceptable_fp_delta;
-    default:
-      iree_status_abort(iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                         "unhandled value type"));
-      return false;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// RNG utilities
-//===----------------------------------------------------------------------===//
-
-void iree_test_utils_write_element(iree_hal_element_type_t element_type,
-                                   int32_t value, void* dst) {
-#define WRITE_ELEMENT_CASE(ETYPE, CTYPE) \
-  case IREE_HAL_ELEMENT_TYPE_##ETYPE:    \
-    *(CTYPE*)dst = (CTYPE)value;         \
-    break;
-
-  switch (element_type) {
-    WRITE_ELEMENT_CASE(INT_8, int8_t)
-    WRITE_ELEMENT_CASE(INT_16, int16_t)
-    WRITE_ELEMENT_CASE(INT_32, int32_t)
-    WRITE_ELEMENT_CASE(INT_64, int64_t)
-    WRITE_ELEMENT_CASE(SINT_8, int8_t)
-    WRITE_ELEMENT_CASE(SINT_16, int16_t)
-    WRITE_ELEMENT_CASE(SINT_32, int32_t)
-    WRITE_ELEMENT_CASE(SINT_64, int64_t)
-    WRITE_ELEMENT_CASE(UINT_8, uint8_t)
-    WRITE_ELEMENT_CASE(UINT_16, uint16_t)
-    WRITE_ELEMENT_CASE(UINT_32, uint32_t)
-    WRITE_ELEMENT_CASE(UINT_64, uint64_t)
-    // clang-format off
-    case IREE_HAL_ELEMENT_TYPE_FLOAT_16:
-      *(uint16_t*)dst = iree_math_f32_to_f16((float)value);
-      break;
-    case IREE_HAL_ELEMENT_TYPE_BFLOAT_16:
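      // bf16 keeps the upper 16 bits of the f32 bit pattern (modulo rounding
      // in the conversion helper): e.g. 1.0f (0x3F800000) becomes bf16 0x3F80.
      // The small test integers generated here stay exactly representable.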
*(uint16_t*)dst = iree_math_f32_to_bf16((float)value);
-      break;
-    WRITE_ELEMENT_CASE(FLOAT_32, float)
-    WRITE_ELEMENT_CASE(FLOAT_64, double)
-    // clang-format on
-    default:
-      IREE_ASSERT(false, "unhandled element type");
-      break;
-  }
-
-#undef WRITE_ELEMENT_CASE
-}
-
-uint32_t iree_test_utils_pseudorandom_uint32(uint32_t* state) {
-  *state = (*state * IREE_PRNG_MULTIPLIER) % IREE_PRNG_MODULUS;
-  return *state;
-}
-
-uint32_t iree_test_utils_pseudorandom_range(uint32_t* state, uint32_t range) {
-  return iree_test_utils_pseudorandom_uint32(state) % range;
-}
-
-void iree_test_utils_get_min_max_for_element_type(
-    iree_hal_element_type_t element_type, int32_t* min, int32_t* max) {
-  switch (element_type) {
-    case IREE_HAL_ELEMENT_TYPE_INT_8:
-    case IREE_HAL_ELEMENT_TYPE_SINT_8:
-      *min = -2;
-      *max = +2;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_UINT_8:
-      *min = 0;
-      *max = +2;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_INT_16:
-    case IREE_HAL_ELEMENT_TYPE_SINT_16:
-    case IREE_HAL_ELEMENT_TYPE_FLOAT_16:
-      *min = -4;
-      *max = +4;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_BFLOAT_16:
-      *min = -2;
-      *max = +2;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_UINT_16:
-      *min = 0;
-      *max = +4;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_INT_32:
-    case IREE_HAL_ELEMENT_TYPE_SINT_32:
-    case IREE_HAL_ELEMENT_TYPE_FLOAT_32:
-      *min = -8;
-      *max = +8;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_UINT_32:
-      *min = 0;
-      *max = +8;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_INT_64:
-    case IREE_HAL_ELEMENT_TYPE_SINT_64:
-    case IREE_HAL_ELEMENT_TYPE_FLOAT_64:
-      *min = -16;
-      *max = +16;
-      break;
-    case IREE_HAL_ELEMENT_TYPE_UINT_64:
-      *min = 0;
-      *max = +16;
-      break;
-    default:
-      IREE_ASSERT(false, "unhandled element type");
-      break;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Test runner
-//===----------------------------------------------------------------------===//
-
-iree_status_t iree_test_utils_check_test_function(iree_vm_function_t function,
-                                                  bool* out_is_valid) {
-  *out_is_valid = true;
-
-  iree_string_view_t function_name = iree_vm_function_name(&function);
-  if (iree_string_view_starts_with(function_name,
-                                   iree_make_cstring_view("__"))) {
-    // Internal compiler/runtime support function.
-    *out_is_valid = false;
-  }
-
-  iree_vm_function_signature_t function_signature =
-      iree_vm_function_signature(&function);
-  iree_host_size_t argument_count = 0;
-  iree_host_size_t result_count = 0;
-  IREE_RETURN_IF_ERROR(iree_vm_function_call_count_arguments_and_results(
-      &function_signature, &argument_count, &result_count));
-  if (argument_count || result_count) {
-    // Takes args or has results we don't expect.
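    // For example, an export shaped like `@matmul_128x128 () -> ()` would be
    // runnable, while `@__init` (internal prefix) or any function taking
    // arguments is skipped. (These names are illustrative, not taken from a
    // real generated suite.)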
- *out_is_valid = false; - } - - return iree_ok_status(); -} - -iree_status_t iree_test_utils_run_test_function( - iree_vm_context_t* context, iree_vm_function_t function, - iree_allocator_t host_allocator) { - IREE_TRACE_ZONE_BEGIN(z0); - iree_string_view_t function_name = iree_vm_function_name(&function); - IREE_TRACE_ZONE_APPEND_TEXT(z0, function_name.data, function_name.size); - fprintf(stderr, "--- TEST[%.*s] ---\n", (int)function_name.size, - function_name.data); - iree_string_view_t function_desc = - iree_vm_function_lookup_attr_by_name(&function, IREE_SV("description")); - if (!iree_string_view_is_empty(function_desc)) { - fprintf(stderr, "%.*s\n", (int)function_desc.size, function_desc.data); - } - iree_status_t status = iree_vm_invoke( - context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/NULL, - /*inputs=*/NULL, /*outputs=*/NULL, host_allocator); - IREE_TRACE_ZONE_END(z0); - return status; -} - -iree_status_t iree_test_utils_run_all_test_functions( - iree_vm_context_t* context, iree_vm_module_t* test_module, - iree_allocator_t host_allocator) { - IREE_TRACE_ZONE_BEGIN(z0); - - // Walk all functions and find the ones we can run (no args, non-internal). - const iree_vm_module_signature_t module_signature = - iree_vm_module_signature(test_module); - for (iree_host_size_t i = 0; i < module_signature.export_function_count; - ++i) { - // Get the function and filter to just the public user exports. - iree_vm_function_t function; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_vm_module_lookup_function_by_ordinal( - test_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function)); - bool is_valid = false; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_test_utils_check_test_function(function, &is_valid)); - if (is_valid) { - // Try to run the function and fail on mismatch. - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_test_utils_run_test_function(context, function, host_allocator)); - } - } - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -iree_status_t iree_test_utils_check_module_requirements( - iree_vm_module_t* module) { - iree_string_view_t target_features = - iree_vm_module_lookup_attr_by_name(module, IREE_SV("target_features")); - while (!iree_string_view_is_empty(target_features)) { - iree_string_view_t required_feature; - iree_string_view_split(target_features, ',', &required_feature, - &target_features); - if (iree_string_view_is_empty(required_feature)) continue; - int64_t feature_is_supported = 0; - IREE_RETURN_IF_ERROR( - iree_cpu_lookup_data_by_key(required_feature, &feature_is_supported)); - if (!feature_is_supported) { - return iree_make_status( - // The error status matters. We distinguish "feature not supported" - // which is a normal thing to happen from actual errors. - IREE_STATUS_NOT_FOUND, - "target device does not have the required feature '%.*s'", - (int)required_feature.size, required_feature.data); - } - } - return iree_ok_status(); -} - -iree_status_t iree_test_utils_load_and_run_e2e_tests( - iree_allocator_t host_allocator, - iree_status_t (*test_module_create)(iree_vm_instance_t*, iree_allocator_t, - iree_vm_module_t**)) { - IREE_TRACE_ZONE_BEGIN(z0); - - iree_cpu_initialize(host_allocator); - - iree_vm_instance_t* instance = NULL; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_tooling_create_instance(host_allocator, &instance)); - - iree_tooling_module_list_t module_list; - iree_tooling_module_list_initialize(&module_list); - - // Create the test module providing helper functions used by test programs. 
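  // The helper module is registered before any --module= flags are loaded so
  // that generated test programs can resolve imports such as
  // matmul_test.check_matmul_results or conv2d_test.generate_random_tensor
  // (names as registered by the runner-specific test_module_create callbacks).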
-  iree_vm_module_t* custom_test_module = NULL;
-  iree_status_t status =
-      test_module_create(instance, host_allocator, &custom_test_module);
-  if (iree_status_is_ok(status)) {
-    status =
-        iree_tooling_module_list_push_back(&module_list, custom_test_module);
-  }
-  iree_vm_module_release(custom_test_module);
-
-  // Load all modules specified by --module= flags.
-  if (iree_status_is_ok(status)) {
-    status = iree_tooling_load_modules_from_flags(instance, host_allocator,
-                                                  &module_list);
-  }
-  iree_vm_module_t* test_module = iree_tooling_module_list_back(&module_list);
-
-  // Create the context with our support module and all --module= flags.
-  iree_vm_context_t* context = NULL;
-  iree_hal_device_t* device = NULL;
-  if (iree_status_is_ok(status)) {
-    status = iree_tooling_create_context_from_flags(
-        instance, module_list.count, module_list.values,
-        /*default_device_uri=*/iree_string_view_empty(), host_allocator,
-        &context, &device, /*out_device_allocator=*/NULL);
-  }
-
-  // Ensure the test module is possible to run.
-  if (iree_status_is_ok(status)) {
-    status = iree_test_utils_check_module_requirements(test_module);
-  }
-  iree_tooling_module_list_reset(&module_list);
-
-  // Begin profiling (if enabled).
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_begin_profiling_from_flags(device);
-  }
-
-  // Run all of the tests in the test module.
-  if (iree_status_is_ok(status)) {
-    status = iree_test_utils_run_all_test_functions(context, test_module,
-                                                    host_allocator);
-  }
-
-  // End profiling (if enabled).
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_end_profiling_from_flags(device);
-  }
-
-  iree_hal_device_release(device);
-  iree_vm_context_release(context);
-  iree_vm_instance_release(instance);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
diff --git a/tools/testing/e2e/test_utils.h b/tools/testing/e2e/test_utils.h
deleted file mode 100644
index f095537112e9..000000000000
--- a/tools/testing/e2e/test_utils.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_TOOLS_TESTING_E2E_TEST_UTILS_H_
-#define IREE_TOOLS_TESTING_E2E_TEST_UTILS_H_
-#include
-
-#include "iree/base/api.h"
-#include "iree/base/internal/flags.h"
-#include "iree/hal/api.h"
-#include "iree/vm/api.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-bool iree_test_utils_require_exact_results(void);
-
-float iree_test_utils_acceptable_fp_delta(void);
-
-int32_t iree_test_utils_max_elements_to_check(void);
-
-const char* iree_test_utils_emoji(bool good);
-
-int iree_test_utils_calculate_check_every(iree_hal_dim_t tot_elements,
-                                          iree_hal_dim_t no_div_of);
-
-// Defines the type of a primitive value.
-typedef enum iree_test_utils_value_type_e {
-  // Not a value type.
-  IREE_TEST_UTILS_VALUE_TYPE_NONE = 0,
-  // int8_t.
-  IREE_TEST_UTILS_VALUE_TYPE_I8 = 1,
-  // int16_t.
-  IREE_TEST_UTILS_VALUE_TYPE_I16 = 2,
-  // int32_t.
-  IREE_TEST_UTILS_VALUE_TYPE_I32 = 3,
-  // int64_t.
-  IREE_TEST_UTILS_VALUE_TYPE_I64 = 4,
-  // f16 (stored as a uint16_t bit pattern).
-  IREE_TEST_UTILS_VALUE_TYPE_F16 = 5,
-  // float.
-  IREE_TEST_UTILS_VALUE_TYPE_F32 = 6,
-  // double.
-  IREE_TEST_UTILS_VALUE_TYPE_F64 = 7,
-  // bfloat16 (stored as a uint16_t bit pattern).
-  IREE_TEST_UTILS_VALUE_TYPE_BF16 = 8,
-} iree_test_utils_value_type_t;
-
-// Maximum size, in bytes, of any value type we can represent.
-#define IREE_E2E_TEST_VALUE_STORAGE_SIZE 8
-
-// A variant value type.
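// The union below stores f16/bf16 as raw uint16_t bit patterns rather than
// converting them; e.g. f16 1.0 is kept as 0x3C00 and only widened to f32
// when compared or printed. value_storage simply guarantees 8 bytes of
// backing for the largest members (i64/f64).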
-typedef struct iree_test_utils_value_t {
-  iree_test_utils_value_type_t type;
-  union {
-    int8_t i8;
-    int16_t i16;
-    int32_t i32;
-    int64_t i64;
-    float f32;
-    uint16_t f16_u16;
-    uint16_t bf16_u16;
-    double f64;
-    uint8_t value_storage[IREE_E2E_TEST_VALUE_STORAGE_SIZE];  // max size of all
-                                                              // value types
-  };
-} iree_test_utils_e2e_value_t;
-
-// Enum controlling how many decimals to print floats with.
-typedef enum iree_test_utils_precision_e {
-  PRECISION_LOW,
-  PRECISION_HIGH,
-} precision_t;
-
-// Reads an element from a buffer given its index.
-iree_test_utils_e2e_value_t iree_test_utils_read_buffer_element(
-    iree_hal_dim_t index, iree_hal_element_type_t result_type,
-    const void* data);
-
-// Prints an iree_test_utils_e2e_value_t to a string buffer. Returns the number
-// of characters written. Like snprintf.
-int iree_test_utils_snprintf_value(char* buf, size_t bufsize,
-                                   iree_test_utils_e2e_value_t value,
-                                   precision_t precision);
-
-// Returns true if |expected| and |actual| agree to tolerable accuracy.
-bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected,
-                                           iree_test_utils_e2e_value_t actual);
-
-//===----------------------------------------------------------------------===//
-// RNG utilities
-//===----------------------------------------------------------------------===//
-
-// Parameters for a locally defined LCG similar to std::minstd_rand.
-#define IREE_PRNG_MULTIPLIER 48271
-#define IREE_PRNG_MODULUS 2147483647
-
-// Simple deterministic pseudorandom generator.
-// This function is the same as C++'s std::minstd_rand.
-uint32_t iree_test_utils_pseudorandom_uint32(uint32_t* state);
-
-// Returns a random uint32_t in the range [0, range).
-uint32_t iree_test_utils_pseudorandom_range(uint32_t* state, uint32_t range);
-
-// Writes an element of the given |element_type| with the given integral
-// |value| to |dst|.
-void iree_test_utils_write_element(iree_hal_element_type_t element_type,
-                                   int32_t value, void* dst);
-
-// Gets the minimum and maximum for an integer-valued uniform distribution.
-void iree_test_utils_get_min_max_for_element_type(
-    iree_hal_element_type_t element_type, int32_t* min, int32_t* max);
-
-// Returns true if the |function| is a supported callable test function.
-// We only support functions that are publicly exported, are not internal
-// compiler/runtime functions (__ prefixed), and take/return no args/results.
-iree_status_t iree_test_utils_check_test_function(iree_vm_function_t function,
-                                                  bool* out_is_valid);
-
-// Synchronously runs a test |function|.
-// If the test fails then the failure status is returned to the caller.
-iree_status_t iree_test_utils_run_test_function(
-    iree_vm_context_t* context, iree_vm_function_t function,
-    iree_allocator_t host_allocator);
-
-// Runs all test functions in |test_module|.
-iree_status_t iree_test_utils_run_all_test_functions(
-    iree_vm_context_t* context, iree_vm_module_t* test_module,
-    iree_allocator_t host_allocator);
-
-// Returns OK if all requirements declared on |module| are met, and otherwise
-// NOT_FOUND, indicating that the module should not be run.
-iree_status_t iree_test_utils_check_module_requirements(
-    iree_vm_module_t* module);
-
-iree_status_t iree_test_utils_load_and_run_e2e_tests(
-    iree_allocator_t host_allocator,
-    iree_status_t (*test_module_create)(iree_vm_instance_t*, iree_allocator_t,
-                                        iree_vm_module_t**));
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-#endif  // IREE_TOOLS_TESTING_E2E_TEST_UTILS_H_
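// For context, a runner built on these utilities was driven entirely by
// flags, along these lines (only --module= appears in main() above; the
// device and tolerance flags are the standard IREE tooling/test_utils flags
// and the values are purely illustrative):
//
//   iree-e2e-matmul-test \
//       --module=matmul_tests.vmfb --module=matmul_calls.vmfb \
//       --device=hip \
//       --require_exact_results=false --acceptable_fp_delta=1e-4
//
// A NOT_FOUND status (missing device or unsupported target_features) maps to
// EXIT_SUCCESS so unsupported configurations skip rather than fail.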