From 3e7916d30ad2b453dc6879275551b5c6a36f14cc Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 31 May 2022 11:11:14 -0700 Subject: [PATCH 001/181] [ci][docker] Prune all non-relevant images (#11497) * [skip ci][ci][docker] Prune all non-relevant images (#11491) Before this would leave around any image that could be used in CI. This PR changes it so that the `docker rmi` knows exactly which image is being used in CI so all others (even those that are being used in the same build but not currently on that node) are deleted This also adds some more logging so we can see what's going on and should help keep disk usage down. Co-authored-by: driazati * [skip ci] Revert "[skip ci][ci][docker] Prune all non-relevant images (#11491)" (#11496) * [ci][docker] Prune all non-relevant images (this is a re-do of #11491) Before this would leave around any image that could be used in CI. This PR changes it so that the `docker rmi` knows exactly which image is being used in CI so all others (even those that are being used in the same build but not currently on that node) are deleted This also adds some more logging so we can see what's going on and should help keep disk usage down. Skipped CI since this runs during lint. Co-authored-by: driazati --- Jenkinsfile | 88 +++++++++++++++++++++++++++++++---- jenkins/Build.groovy.j2 | 7 +++ jenkins/DockerBuild.groovy.j2 | 8 ++++ jenkins/Lint.groovy.j2 | 1 + jenkins/Prepare.groovy.j2 | 23 +++++++-- jenkins/Test.groovy.j2 | 15 +++++- jenkins/macros.j2 | 9 ++-- 7 files changed, 134 insertions(+), 17 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d239d362f9ae3..44389ba767dc7 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-26T15:43:31.409794 +// Generated at 2022-05-27T14:45:11.226042 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -108,11 +108,7 @@ def per_exec_ws(folder) { def init_git() { checkout scm - // Clear out all Docker images that aren't going to be used - sh( - script: "docker image ls --all --format '{{.Repository}}:{{.Tag}} {{.ID}}' | { grep -vE '${ci_arm}|${ci_cpu}|${ci_gpu}|${ci_hexagon}|${ci_i386}|${ci_lint}|${ci_qemu}|${ci_wasm}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }", - label: 'Clean old Docker images', - ) + // Add more info about job node sh ( script: './tests/scripts/task_show_node_info.sh', @@ -160,6 +156,23 @@ def init_git() { ) } +def docker_init(image) { + // Clear out all Docker images that aren't going to be used + sh( + script: """ + set -eux + docker image ls --all + IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}} {{.ID}}') + + echo -e "Found images:\\n\$IMAGES" + echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? 
= 123; } + + docker image ls --all + """, + label: 'Clean old Docker images', + ) +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', @@ -321,6 +334,7 @@ def build_docker_images() { parallel 'ci-lint': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_lint') } @@ -328,6 +342,7 @@ def build_docker_images() { }, 'ci-cpu': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_cpu') } @@ -335,6 +350,7 @@ def build_docker_images() { }, 'ci-gpu': { node('GPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_gpu') } @@ -342,6 +358,7 @@ def build_docker_images() { }, 'ci-qemu': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_qemu') } @@ -349,6 +366,7 @@ def build_docker_images() { }, 'ci-i386': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_i386') } @@ -356,6 +374,7 @@ def build_docker_images() { }, 'ci-arm': { node('ARM') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_arm') } @@ -363,6 +382,7 @@ def build_docker_images() { }, 'ci-wasm': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_wasm') } @@ -370,6 +390,7 @@ def build_docker_images() { }, 'ci-hexagon': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_hexagon') } @@ -424,6 +445,7 @@ def lint() { 'Lint 1 of 2': { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { + docker_init(ci_lint) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -441,6 +463,7 @@ def lint() { 'Lint 2 of 2': { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { + docker_init(ci_lint) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -518,6 +541,7 @@ stage('Build') { if (!skip_ci) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") { + docker_init(ci_gpu) init_git() sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" make("${ci_gpu} --no-gpu", 'build', '-j2') @@ -564,6 +588,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu") { + docker_init(ci_cpu) init_git() sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", @@ -603,6 +628,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-wasm") { + docker_init(ci_wasm) init_git() sh ( script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", @@ -627,6 +653,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-i386") { + docker_init(ci_i386) init_git() sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", @@ -660,6 +687,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-arm") { + docker_init(ci_arm) init_git() sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", @@ -691,6 +719,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) 
{ node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-qemu") { + docker_init(ci_qemu) init_git() sh ( script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build", @@ -721,6 +750,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") { + docker_init(ci_hexagon) init_git() sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", @@ -765,6 +795,7 @@ def shard_run_unittest_GPU_1_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -830,6 +861,7 @@ def shard_run_unittest_GPU_2_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -881,6 +913,7 @@ def shard_run_unittest_GPU_3_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -929,6 +962,7 @@ def shard_run_integration_CPU_1_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -974,6 +1008,7 @@ def shard_run_integration_CPU_2_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1019,6 +1054,7 @@ def shard_run_integration_CPU_3_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1064,6 +1100,7 @@ def shard_run_integration_CPU_4_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1109,6 +1146,7 @@ def shard_run_integration_CPU_5_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1154,6 +1192,7 @@ def shard_run_integration_CPU_6_of_6() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { + docker_init(ci_cpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1200,6 +1239,7 @@ def shard_run_python_i386_1_of_5() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1246,6 +1286,7 @@ def shard_run_python_i386_2_of_5() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1291,6 +1332,7 @@ def shard_run_python_i386_3_of_5() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1336,6 +1378,7 @@ def shard_run_python_i386_4_of_5() { node('CPU-SMALL') { 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1381,6 +1424,7 @@ def shard_run_python_i386_5_of_5() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1427,6 +1471,7 @@ def shard_run_test_Hexagon_1_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1471,6 +1516,7 @@ def shard_run_test_Hexagon_2_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1514,6 +1560,7 @@ def shard_run_test_Hexagon_3_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1557,6 +1604,7 @@ def shard_run_test_Hexagon_4_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1600,6 +1648,7 @@ def shard_run_test_Hexagon_5_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1643,6 +1692,7 @@ def shard_run_test_Hexagon_6_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1686,6 +1736,7 @@ def shard_run_test_Hexagon_7_of_7() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1730,6 +1781,7 @@ def shard_run_integration_aarch64_1_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1774,6 +1826,7 @@ def shard_run_integration_aarch64_2_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1818,6 +1871,7 @@ def shard_run_integration_aarch64_3_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1862,6 +1916,7 @@ def shard_run_integration_aarch64_4_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1907,6 +1962,7 @@ def shard_run_topi_GPU_1_of_4() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1950,6 +2006,7 @@ def shard_run_topi_GPU_2_of_4() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -1993,6 +2050,7 @@ def 
shard_run_topi_GPU_3_of_4() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2036,6 +2094,7 @@ def shard_run_topi_GPU_4_of_4() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2080,6 +2139,7 @@ def shard_run_frontend_GPU_1_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2123,6 +2183,7 @@ def shard_run_frontend_GPU_2_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2166,6 +2227,7 @@ def shard_run_frontend_GPU_3_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2209,6 +2271,7 @@ def shard_run_frontend_GPU_4_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2252,6 +2315,7 @@ def shard_run_frontend_GPU_5_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2295,6 +2359,7 @@ def shard_run_frontend_GPU_6_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2339,6 +2404,7 @@ def shard_run_topi_aarch64_1_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2387,6 +2453,7 @@ def shard_run_topi_aarch64_2_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2436,6 +2503,7 @@ def shard_run_frontend_aarch64_1_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2479,6 +2547,7 @@ def shard_run_frontend_aarch64_2_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -2648,6 +2717,7 @@ stage('Test') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") { timeout(time: max_time, unit: 'MINUTES') { try { + docker_init(ci_cpu) init_git() withEnv(['PLATFORM=cpu'], { sh( @@ -2692,6 +2762,7 @@ stage('Test') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-qemu") { timeout(time: max_time, unit: 'MINUTES') { try { + docker_init(ci_qemu) init_git() withEnv(['PLATFORM=qemu'], { sh( @@ -2736,6 +2807,7 @@ stage('Test') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-cpu") { timeout(time: max_time, unit: 'MINUTES') { try { + docker_init(ci_cpu) init_git() withEnv(['PLATFORM=cpu'], { sh( @@ -2773,6 +2845,7 @@ stage('Test') { if (!skip_ci) { node('GPU') { 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") { + docker_init(ci_gpu) init_git() sh( script: """ @@ -2814,8 +2887,7 @@ stage('Test') { }, ) } -} -/* +}/* stage('Build packages') { parallel 'conda CPU': { node('CPU') { diff --git a/jenkins/Build.groovy.j2 b/jenkins/Build.groovy.j2 index 4b0b4ae2e2c80..62ccc94916048 100644 --- a/jenkins/Build.groovy.j2 +++ b/jenkins/Build.groovy.j2 @@ -62,6 +62,7 @@ stage('Build') { if (!skip_ci) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-gpu') }}) { + docker_init(ci_gpu) init_git() sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" make("${ci_gpu} --no-gpu", 'build', '-j2') @@ -79,6 +80,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-cpu') }}) { + docker_init(ci_cpu) init_git() sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", @@ -102,6 +104,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-wasm') }}) { + docker_init(ci_wasm) init_git() sh ( script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", @@ -126,6 +129,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-i386') }}) { + docker_init(ci_i386) init_git() sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", @@ -143,6 +147,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('ARM-SMALL') { ws({{ m.per_exec_ws('tvm/build-arm') }}) { + docker_init(ci_arm) init_git() sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", @@ -160,6 +165,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-qemu') }}) { + docker_init(ci_qemu) init_git() sh ( script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build", @@ -177,6 +183,7 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-hexagon') }}) { + docker_init(ci_hexagon) init_git() sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", diff --git a/jenkins/DockerBuild.groovy.j2 b/jenkins/DockerBuild.groovy.j2 index 84bb8e3e376d1..e9d80801a9d9c 100644 --- a/jenkins/DockerBuild.groovy.j2 +++ b/jenkins/DockerBuild.groovy.j2 @@ -59,6 +59,7 @@ def build_docker_images() { parallel 'ci-lint': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_lint') } @@ -66,6 +67,7 @@ def build_docker_images() { }, 'ci-cpu': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_cpu') } @@ -73,6 +75,7 @@ def build_docker_images() { }, 'ci-gpu': { node('GPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_gpu') } @@ -80,6 +83,7 @@ def build_docker_images() { }, 'ci-qemu': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_qemu') } @@ -87,6 +91,7 @@ def build_docker_images() { }, 'ci-i386': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_i386') } @@ -94,6 +99,7 @@ def build_docker_images() { }, 'ci-arm': { node('ARM') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_arm') } @@ -101,6 +107,7 @@ def 
build_docker_images() { }, 'ci-wasm': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_wasm') } @@ -108,6 +115,7 @@ def build_docker_images() { }, 'ci-hexagon': { node('CPU') { timeout(time: max_time, unit: 'MINUTES') { + docker_init('none') init_git() build_image('ci_hexagon') } diff --git a/jenkins/Lint.groovy.j2 b/jenkins/Lint.groovy.j2 index 61c13cd407d02..40dad3aef7be3 100644 --- a/jenkins/Lint.groovy.j2 +++ b/jenkins/Lint.groovy.j2 @@ -6,6 +6,7 @@ def lint() { num_shards=2, node='CPU-SMALL', ws='tvm/lint', + docker_image='ci_lint', ) %} sh ( diff --git a/jenkins/Prepare.groovy.j2 b/jenkins/Prepare.groovy.j2 index b4db7de63bd15..2900775f49452 100644 --- a/jenkins/Prepare.groovy.j2 +++ b/jenkins/Prepare.groovy.j2 @@ -6,11 +6,7 @@ def per_exec_ws(folder) { def init_git() { checkout scm - // Clear out all Docker images that aren't going to be used - sh( - script: "docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}} {{.ID}}'{% endraw %} | { grep -vE '{% for image in images %}{% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %}{% if not loop.last %}|{% endif %}{% endfor %}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }", - label: 'Clean old Docker images', - ) + // Add more info about job node sh ( script: './tests/scripts/task_show_node_info.sh', @@ -58,6 +54,23 @@ def init_git() { ) } +def docker_init(image) { + // Clear out all Docker images that aren't going to be used + sh( + script: """ + set -eux + docker image ls --all + IMAGES=\$(docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}} {{.ID}}'{% endraw %}) + + echo -e "Found images:\\n\$IMAGES" + echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? 
= 123; } + + docker image ls --all + """, + label: 'Clean old Docker images', + ) +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/jenkins/Test.groovy.j2 b/jenkins/Test.groovy.j2 index a08c50905a056..9f949ae717c2a 100644 --- a/jenkins/Test.groovy.j2 +++ b/jenkins/Test.groovy.j2 @@ -10,6 +10,7 @@ node="GPU", ws="tvm/ut-python-gpu", platform="gpu", + docker_image="ci_gpu", test_method_names=test_method_names, ) %} {% if shard_index == 1 %} @@ -44,6 +45,7 @@ num_shards=6, ws="tvm/integration-python-cpu", platform="cpu", + docker_image="ci_cpu", test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} @@ -59,6 +61,7 @@ num_shards=5, ws="tvm/integration-python-i386", platform="i386", + docker_image="ci_i386", test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }} @@ -78,6 +81,7 @@ node="CPU-SMALL", ws="tvm/test-hexagon", platform="hexagon", + docker_image="ci_hexagon", test_method_names=test_method_names, num_shards=7, ) %} @@ -98,6 +102,7 @@ node="ARM-SMALL", ws="tvm/ut-python-arm", platform="arm", + docker_image="ci_arm", test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} @@ -114,6 +119,7 @@ num_shards=4, ws="tvm/topi-python-gpu", platform="gpu", + docker_image="ci_gpu", test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} @@ -129,6 +135,7 @@ num_shards=6, ws="tvm/frontend-python-gpu", platform="gpu", + docker_image="ci_gpu", test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} @@ -143,6 +150,7 @@ node="ARM-SMALL", ws="tvm/ut-python-arm", platform="arm", + docker_image="ci_arm", num_shards=2, test_method_names=test_method_names, ) %} @@ -163,6 +171,7 @@ node="ARM-SMALL", ws="tvm/frontend-python-arm", platform="arm", + docker_image="ci_arm", num_shards=2, test_method_names=test_method_names, ) %} @@ -191,6 +200,7 @@ stage('Test') { node="CPU-SMALL", ws="tvm/ut-python-cpu", platform="cpu", + docker_image="ci_cpu", ) %} {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} ci_setup(ci_cpu) @@ -207,6 +217,7 @@ stage('Test') { node="CPU-SMALL", ws="tvm/test-qemu", platform="qemu", + docker_image="ci_qemu", ) %} {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} add_microtvm_permissions() @@ -226,6 +237,7 @@ stage('Test') { node="CPU-SMALL", ws="tvm/frontend-python-cpu", platform="cpu", + docker_image="ci_cpu", ) %} {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }} ci_setup(ci_cpu) @@ -238,6 +250,7 @@ stage('Test') { if (!skip_ci) { node('GPU') { ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) { + docker_init(ci_gpu) init_git() {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} add_microtvm_permissions() @@ -256,4 +269,4 @@ stage('Test') { }, ) } -} +} \ No newline at end of file diff --git a/jenkins/macros.j2 b/jenkins/macros.j2 index 1c649e31fabfd..5a641b73fea84 100644 --- a/jenkins/macros.j2 +++ b/jenkins/macros.j2 @@ -19,7 +19,7 @@ "workspace/exec_${env.EXECUTOR_NUMBER}/{{ folder }}" {%- endmacro -%} -{% macro sharded_test_step(name, num_shards, node, ws, platform, test_method_names) %} +{% macro sharded_test_step(name, num_shards, node, ws, docker_image, platform, test_method_names) %} {% for shard_index in range(1, num_shards + 1) %} {% set method_name = 
"shard_run_" + name.replace(":", "").replace(" ", "-").replace("-", "_") + "_" + shard_index|string + "_of_" + num_shards|string %} @@ -28,6 +28,7 @@ def {{ method_name }}() { node('{{ node }}') { ws({{ per_exec_ws(ws) }}) { try { + docker_init({{ docker_image }}) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -51,11 +52,12 @@ def {{ method_name }}() { {% endfor %} {% endmacro %} -{% macro sharded_lint_step(name, num_shards, node, ws) %} +{% macro sharded_lint_step(name, num_shards, docker_image, node, ws) %} {% for shard_index in range(1, num_shards + 1) %} '{{ name }} {{ shard_index }} of {{ num_shards }}': { node('{{ node }}') { ws({{ per_exec_ws(ws) }}) { + docker_init({{ docker_image }}) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ @@ -71,13 +73,14 @@ def {{ method_name }}() { {% endmacro %} -{% macro test_step(name, node, ws, platform) %} +{% macro test_step(name, node, ws, docker_image, platform) %} '{{ name }}': { if (!skip_ci && is_docs_only_build != 1) { node('{{ node }}') { ws({{ per_exec_ws(ws) }}) { timeout(time: max_time, unit: 'MINUTES') { try { + docker_init({{ docker_image }}) init_git() withEnv(['PLATFORM={{ platform }}'], { {{ caller() | indent(width=12) | trim }} From c1b22eefb5dc5c00d945a4cae6c91ce078afcc7d Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 1 Jun 2022 02:50:00 +0800 Subject: [PATCH 002/181] [Arith] Merge surjective/non-surjective iter mapping detections (#11287) * simplify (x * 96) % 64 to (x * 32) % 64 * adapt merge mulmod opt for OffsetOf computation * merge DetectIterMap and DetectIterMapPadded * adjust related interfaces for IterMapLevel * - check incompatible left paddings - determine case like x % 16, x in [0, 5) to be non-surjective, since usages may treat the region extent as 16 by mistake. - skip second round of rewrite when there is no padding - fix some typo in comments * rebase upstream --- include/tvm/arith/iter_affine_map.h | 114 ++- python/tvm/arith/iter_affine_map.py | 53 +- src/arith/int_set.cc | 5 +- src/arith/iter_affine_map.cc | 490 +++++++------ src/arith/pattern_match.h | 2 + src/arith/rewrite_simplify.cc | 72 +- src/arith/rewrite_simplify.h | 2 + src/tir/ir/buffer.cc | 17 +- src/tir/ir/index_map.cc | 23 +- src/tir/schedule/analysis/analysis.cc | 8 +- src/tir/schedule/analysis/layout.cc | 11 +- .../schedule/primitive/blockize_tensorize.cc | 7 +- src/tir/schedule/primitive/compute_at.cc | 2 +- src/tir/schedule/primitive/compute_inline.cc | 5 +- .../primitive/layout_transformation.cc | 7 +- .../schedule/primitive/loop_transformation.cc | 2 +- .../unittest/test_arith_iter_affine_map.py | 674 ++++++++++-------- .../unittest/test_arith_rewrite_simplify.py | 14 +- tests/python/unittest/test_tir_buffer.py | 14 +- .../unittest/test_tir_schedule_compute_at.py | 38 + 20 files changed, 871 insertions(+), 689 deletions(-) diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index 4cf6f086d1ed3..2c0e5e92997af 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -259,53 +259,29 @@ class IterSumExpr : public IterMapExpr { TVM_DEFINE_OBJECT_REF_COW_METHOD(IterSumExprNode); }; +/*! \brief Mapping level for iterators. */ +enum IterMapLevel { + // Require the mapping to be bijective. + Bijective = 0, + // Require the mapping to be surjective. + Surjective = 1, + // No mapping safety check. + NoCheck = 3 +}; + /*! 
- * \brief Detect if indices can be written as - * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n] - * - * Here y = some-quasi-affine-iter-map(input_iters) - * and c are symbolic constants. - * - * We also requires that y_i and y_j to be independent for i != j. - * - * For returned value rv, the following is always true: - * - rv[i]->args.size() <=1: only one iterator per element. - * - * \param indices The indices to detect pattern for. - * \param input_iters Map from variable to iterator's range. - * \param predicate The predicate constraints on the input iterators - * \param require_bijective A boolean flag that indicates whether the mapping should be bijective. - * \param analyzer Analyzer used to get context information. - * \param simplify_trivial_iterators If true, iterators with extent of - * 1 will be replaced with a constant value. - * - * \return The detected pattern if a match exists, - * otherwise return an empty array. + * \brief Result of DetectIterMap. */ -Array DetectIterMap(const Array& indices, const Map& input_iters, - const PrimExpr& predicate, bool require_bijective, - arith::Analyzer* analyzer, bool simplify_trivial_iterators = true); +class IterMapResultNode : public Object { + public: + // The detected pattern if a match exists. + Array indices; -/*! \brief A utility struct for return values from DetectPaddedIterMap - */ -struct PaddedIterMapResult { // Any errors that occurred while converting the input indices. If // the array is empty, the conversion was successful. Array errors; - // The detected pattern if a match exists. - Array indices; - - /* \brief Boolean expression indicating if padding was required - * - * `requires_padding` evaluates to true if the returned indices - * contain padding relative to the provided expressions, and false - * otherwise. If `input_iters` contains a variable extent, this - * expression may be in terms of those variables. - */ - PrimExpr requires_padding; - - /* \brief Boolean expression indicating if a specific value w + /*! \brief Boolean expression indicating if a specific value w * * `padding_predicate` evaluates to true for a set of indices that * are outside the bounds of the provided index iterators, but @@ -314,43 +290,57 @@ struct PaddedIterMapResult { * `input_iters`. */ PrimExpr padding_predicate; + + // overrides + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("errors", &errors); + v->Visit("indices", &indices); + v->Visit("padding_predicate", &padding_predicate); + } + + static constexpr const char* _type_key = "arith.IterMapResult"; + TVM_DECLARE_FINAL_OBJECT_INFO(IterMapResultNode, Object); +}; + +/*! + * \brief Managed reference to IterMapResultNode. + * \sa IterMapResultNode + */ +class IterMapResult : public ObjectRef { + public: + // constructor + IterMapResult() { data_ = make_object(); } + + /*! \return mutable pointers to the node. */ + IterMapResultNode* operator->() const { return static_cast(get_mutable()); } }; /*! * \brief Detect if indices can be written as * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n] * - * Here y = some-quasi-affine-iter-map(input_iters) and c are - * symbolic constants. The y_i iterators may be padded to fit this - * representation. + * Here y = some-quasi-affine-iter-map(input_iters) + * and c are symbolic constants. * * We also requires that y_i and y_j to be independent for i != j. * * For returned value rv, the following is always true: - * - rv.indices[i]->args.size() <=1: only one iterator per element. + * - rv[i]->args.size() <=1: only one iterator per element. 
* * \param indices The indices to detect pattern for. - * * \param input_iters Map from variable to iterator's range. - * * \param predicate The predicate constraints on the input iterators - * - * \param require_bijective A boolean flag that indicates whether the - * mapping should be bijective. If true, no padding may be - * introduced. - * + * \param check_level The iter mapping checking level. * \param analyzer Analyzer used to get context information. - * * \param simplify_trivial_iterators If true, iterators with extent of * 1 will be replaced with a constant value. * - * \return An instance of PaddedIterMapResult. + * \return The detected iteration result. + * The return object's .indices is empty on failure. */ -PaddedIterMapResult DetectPaddedIterMap(const Array& indices, - const Map& input_iters, - const PrimExpr& predicate, bool require_bijective, - arith::Analyzer* analyzer, - bool simplify_trivial_iterators = true); +IterMapResult DetectIterMap(const Array& indices, const Map& input_iters, + const PrimExpr& predicate, IterMapLevel check_level, + arith::Analyzer* analyzer, bool simplify_trivial_iterators = true); /*! * \brief Use IterVarMap detector to rewrite and simplify the indices @@ -358,12 +348,12 @@ PaddedIterMapResult DetectPaddedIterMap(const Array& indices, * \param indices The indices to detect pattern for. * \param input_iters Map from variable to iterator's range. * \param input_pred The predicate constraints on the input iterators - * \param require_bijective A boolean flag that indicates whether the mapping should be bijective. + * \param check_level The iter mapping checking level. * * \return The indices after rewrite */ Array IterMapSimplify(const Array& indices, const Map& input_iters, - const PrimExpr& input_pred, bool require_bijective); + const PrimExpr& input_pred, IterMapLevel check_level); /*! * \brief Apply the inverse of the affine transformation to the outputs. @@ -403,7 +393,7 @@ Map InverseAffineIterMap(const Array& iter_map, * \param input_iters Map from variable to iterator's range. * \param sub_iters Iterators of subspace. * \param predicate The predicate constraints on the input iterators - * \param require_bijective A boolean flag that indicates whether the mapping should be bijective. + * \param check_level The iter mapping checking level. * \param analyzer Analyzer used to get context information. * * \return The result list has length len(bindings) + 1 @@ -416,7 +406,7 @@ Map InverseAffineIterMap(const Array& iter_map, Array> SubspaceDivide(const Array& bindings, const Map& input_iters, const Array& sub_iters, const PrimExpr& predicate, - bool require_bijective, arith::Analyzer* analyzer); + IterMapLevel check_level, arith::Analyzer* analyzer); /*! * \brief Given an expression that may contain IterMapExpr, transform it to normal PrimExpr. diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py index 2be939a12277c..77d6f418b8537 100644 --- a/python/tvm/arith/iter_affine_map.py +++ b/python/tvm/arith/iter_affine_map.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
""" Iterator (quasi)affine mapping patterns.""" +from enum import IntEnum import tvm._ffi from tvm.runtime import Object from tvm.ir import PrimExpr @@ -88,11 +89,35 @@ def __init__(self, args, base): self.__init_handle_by_constructor__(_ffi_api.IterSumExpr, args, base) +class IterMapLevel(IntEnum): + """Possible kinds of iter mapping check level.""" + + Bijective = 0 + Surjective = 1 + NoCheck = 3 + + @staticmethod + def from_str(name: str): + """Helper to create level enum from string""" + if name is None: + return IterMapLevel.NoCheck + name = name.lower() + if name == "bijective": + check_level = IterMapLevel.Bijective + elif name == "surjective": + check_level = IterMapLevel.Surjective + elif name == "nocheck": + check_level = IterMapLevel.NoCheck + else: + raise ValueError(f"Unknown check level {name}") + return check_level + + def detect_iter_map( indices, input_iters, predicate=True, - require_bijective=False, + check_level=IterMapLevel.Surjective, simplify_trivial_iterators=True, ): """Detect if indices can be written as mapped iters from input iters @@ -108,8 +133,8 @@ def detect_iter_map( predicate : PrimExpr The predicate constraints on the input iterators - require_bijective : bool - A boolean flag that indicates whether the mapping should be bijective + check_level : Union[str, IterMapLevel] + Checking level of iteration mapping simplify_trivial_iterators: bool If true, iterators with extent of 1 will be replaced with a @@ -117,13 +142,17 @@ def detect_iter_map( Returns ------- - results : List[IterSumExpr] + results : IterMapResult The iter map matching result. - Empty array if no match can be found. + The result's .indices is empty array if no match can be found. """ + if isinstance(check_level, str): + check_level = IterMapLevel.from_str(check_level) + elif check_level is None: + check_level = IterMapLevel.NoCheck return _ffi_api.DetectIterMap( - indices, input_iters, predicate, require_bijective, simplify_trivial_iterators + indices, input_iters, predicate, check_level, simplify_trivial_iterators ) @@ -143,7 +172,9 @@ def normalize_iter_map_to_expr(expr): return _ffi_api.NormalizeIterMapToExpr(expr) -def subspace_divide(bindings, input_iters, sub_iters, predicate=True, require_bijective=False): +def subspace_divide( + bindings, input_iters, sub_iters, predicate=True, check_level=IterMapLevel.Surjective +): """Detect if bindings can be written as [a_0*e_0 + b_0 + c_0, a_1*e_1 + b_1, ..., a_n*e_n + b_n] where a = some-quasi-affine-iter-map(input_iters set_minus sub_iters) @@ -172,8 +203,8 @@ def subspace_divide(bindings, input_iters, sub_iters, predicate=True, require_bi predicate : PrimExpr The predicate constraints on the input iterators - require_bijective : bool - A boolean flag that indicates whether the bindings should be bijective + check_level : Union[str, IterMapLevel] + Checking level of iteration mapping Returns ------- @@ -185,7 +216,9 @@ def subspace_divide(bindings, input_iters, sub_iters, predicate=True, require_bi len(bindings): the predicate of outer space and inner space Empty array if no match can be found. 
""" - return _ffi_api.SubspaceDivide(bindings, input_iters, sub_iters, predicate, require_bijective) + if isinstance(check_level, str): + check_level = IterMapLevel.from_str(check_level) + return _ffi_api.SubspaceDivide(bindings, input_iters, sub_iters, predicate, check_level) def inverse_affine_iter_map(iter_map, outputs): diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index a3fa879afa270..48fae479b042b 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -867,9 +867,10 @@ Optional> EstimateRegionLowerBound(const Array& region, for (const Range& range : region) { affine_indices.push_back(range->min); } - iter_sum_exprs = DetectIterMap( + auto res = DetectIterMap( /*indices=*/affine_indices, /*input_iters=*/var_dom, - /*predicate=*/predicate, /*require_bijective=*/false, analyzer); + /*predicate=*/predicate, /*check_level=*/IterMapLevel::Surjective, analyzer); + iter_sum_exprs = res->indices; } if (iter_sum_exprs.empty()) { return NullOpt; diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 9fad3b2816a12..cce826fedca64 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -178,10 +178,7 @@ class IterMapRewriter : public ExprMutator { explicit IterMapRewriter(Analyzer* analyzer, const Map& input_iters, bool simplify_trivial_iterators, Array* errors) - : analyzer_(analyzer), - errors_(*errors), - requires_padding_(const_false()), - padding_predicate_(const_false()) { + : analyzer_(analyzer), errors_(*errors), padding_predicate_(const_false()) { for (auto kv : input_iters) { const Var& var = kv.first; const Range& vrng = kv.second; @@ -202,16 +199,17 @@ class IterMapRewriter : public ExprMutator { } PrimExpr padding_predicate() const { return padding_predicate_; } - PrimExpr requires_padding() const { return requires_padding_; } + bool requires_padding() const { return requires_padding_; } IterSumExpr Rewrite(const PrimExpr& expr) { return NormalizeToIterWithOffset(ToIterSumExpr(DirectMutate(expr))); } - void UpdatePadding(const PrimExpr& expr) { + IterSumExpr RewriteAndUpdatePadding(const PrimExpr& expr) { update_iterator_padding_ = true; - DirectMutate(expr); + auto res = Rewrite(expr); update_iterator_padding_ = false; + return res; } IterSumExpr RewriteIterConstraint(const PrimExpr& expr, @@ -222,7 +220,7 @@ class IterMapRewriter : public ExprMutator { } /*! - * \brief If require_bijective is true, this function checks two conditions: + * \brief If require bijective mapping, this function checks two conditions: * - C0: Each iter mark should be fully covered by non-overlapping splits. * - C1: All of the input iterators are used. * Example: given x in [0, 8) y in [0, 6) @@ -232,7 +230,7 @@ class IterMapRewriter : public ExprMutator { * contribute two non-overlapping splits that covers x. * - bindings = [x / 4, x % 4] won't pass because y is not used. * - * If require_bijective is false, this function checks one condition: + * If only require surjective mapping, this function checks one condition: * - C0: Each iter mark has a chance to be fully covered by non-overlapping splits. 
* Example: given x in [0, 8) y in [0, 6) * - bindings = [x / 4] will pass because x / 4 can be one split of x @@ -241,7 +239,7 @@ class IterMapRewriter : public ExprMutator { * - bindings = [x / 3] will not pass because x / 3 can not be one split of x * \return whether the bindings are valid */ - bool CheckMapping(const Array& bindings, bool require_bijective) { + bool CheckMapping(const Array& bindings, IterMapLevel check_level) { IterMarkSplitCollector collector; // We can check that for each iter mark: // All the splits that refers to the iter_mark covers its extent. @@ -249,11 +247,11 @@ class IterMapRewriter : public ExprMutator { collector.Collect(bindings); for (const IterMark& mark : collector.visited_) { - if (TryNormalizeSplits(mark, collector.mark2splits_[mark], require_bijective).empty()) { + if (TryNormalizeSplits(mark, collector.mark2splits_[mark], check_level).empty()) { return false; } } - if (require_bijective) { + if (check_level == IterMapLevel::Bijective) { // all input marks must be visited for (const IterMark& mark : input_marks_) { if (collector.visited_.count(mark) == 0 && !is_one(mark->extent)) { @@ -375,13 +373,14 @@ class IterMapRewriter : public ExprMutator { }; struct IterPaddingInfo { - // Used and collected during first pass - std::vector divisors; + // GCD of padding factor collected during first pass + PrimExpr padding_factor{1}; + + PrimExpr left_pad{0}; + PrimExpr right_pad{0}; - // Defined on first encounter in second pass - IterSplitExpr padded; - PrimExpr left_pad; - PrimExpr right_pad; + // Padded form of original iter mark + IterMark padded; }; // temp hash for de-duplication purposes. @@ -427,41 +426,30 @@ class IterMapRewriter : public ExprMutator { // input iter marks std::vector input_marks_; - // Map from a normal PrimExpr to the padded iterator information for + // Map from an iter mark to the padded iterator information for // it. This is necessary for introducing the same padding in all // usage of an input iterator. (e.g. (i-1) occurring in the // expressions [(i-1)%8, ((i-1)//8)%4, (i-1)//32] should be // left-padded by 31 for each occurrence.) - std::unordered_map padded_iter_map_; + std::unordered_map padded_iter_map_; + + // Map from padded iter mark to it's origin mark + std::unordered_map padded_origin_map_; - /* If allow_padding_ is true, allow the extents of the IterMap to be + /* If update_iterator_padding_ is true, allow the extents of the IterMap to be * padded beyond the original iterators. * - * For example, if allow_padding_ is true, the expressions i//4 and + * For example, if update_iterator_padding_ is true, the expressions i//4 and * i%4, where i is on the range [0,18), would be represented as * IterSplit(i, lower_factor=4, extent=5) and IterSplit(i, extent=4). - * This representation would be forbidden if allow_padding_ is false, + * This representation would be forbidden if update_iterator_padding_ is false, * because lower_factor=4 does not evenly divide the original extent of * 18. */ bool update_iterator_padding_{false}; - /* A boolean expression that is true if any padding has been introduced - * by the transformation, and false otherwise. 
- * - * Example: [i//4, i%4], i in range [0,16) - * requires_padding_ will be false - * - * Example: [i//4, i%4], i in range [0,18) - * requires_padding_ will be true - * - * Example: [i//4, i%4], i in range [0,N) - * requires_padding_ will be the expression N%4==0 - */ - PrimExpr requires_padding_; - /* A boolean expression that is true for any padding that has been - * introduced, and false otherwise. If allow_padding_ is false, + * introduced, and false otherwise. If update_iterator_padding_ is false, * padding_predicate_ will always be false. * * Example: [i//4, i%4], i in range [0,16) @@ -475,6 +463,11 @@ class IterMapRewriter : public ExprMutator { */ PrimExpr padding_predicate_; + /* A boolean flag denotes there are padding iterations detected + * in the first round of indices rewriting. + */ + bool requires_padding_{false}; + // The map for sum that maps flattened form to IterMark with normal form and extent (and possibly // an extra offset) // Example(1): expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) @@ -538,13 +531,12 @@ class IterMapRewriter : public ExprMutator { * If not, return an empty array. * \param mark The iterator of interest. * \param splits The splits to be verified. - * \param require_bijective A boolean flag that indicates whether the bindings should be - * bijective. + * \param check_level Iteration mapping's check level. * \return The normalized splits. */ Array TryNormalizeSplits(const IterMark& mark, const std::vector& splits, - bool require_bijective) { + IterMapLevel check_level) { std::vector used(splits.size(), false); std::vector iters; PrimExpr expected_lower_factor = make_const(mark->source->dtype, 1); @@ -559,7 +551,7 @@ class IterMapRewriter : public ExprMutator { } if (j == splits.size()) { // we do not allow incomplete split if the bindings should be bijective - if (require_bijective) { + if (check_level == IterMapLevel::Bijective) { return Array(); } // look for the next split skipping this lower factor @@ -578,18 +570,64 @@ class IterMapRewriter : public ExprMutator { expected_lower_factor = splits[j]->lower_factor * splits[j]->extent; } + // Extract iteration mark info before padding + auto pad_mark_it = padded_origin_map_.find(mark); + bool has_padding = pad_mark_it != padded_origin_map_.end(); + + bool match_full_iter = analyzer_->CanProveEqual(expected_lower_factor, mark->extent); + bool match_iter_divisor = + match_full_iter || CanProveDivisible(mark->extent, expected_lower_factor); + // Case 1. bijective is required. - // We check the extent we calculate is consistent with the extent of the mark - // Case 2. bijective is not required. + // We check the extent we calculate is consistent with the extent of the mark and + // iteration mark's padding is not allowed. + // + // Case 2. bijective is not required and there is no padding. // We check the extent we calculate is a factor of the extent of the mark // For example, y \in [0, 24) [(y / 2) % 6, y % 2] is valid, but y \in [0, 25) is not. - if (require_bijective) { - if (!analyzer_->CanProveEqual(expected_lower_factor, mark->extent)) { - return Array(); + // + // Case 3. bijective is not required and there exists padding. We check either + // (3.1) The extent we calculate is consistent with the extent of the padded mark and it is + // the single split for the iter mark. + // For example, padded iter p in [0, 24), [(p / 12)] is valid because it is surjective + // according to how we pad the original iteration mark. 
+ // (3.2) The extent we calculate is a factor of the extent of the padded mark, and the extent + // before padding is greater or equal than the extent we calculate. + // For example, the original extent is 14, [(p % 12)] is valid, with p padded to 24. + // + if (check_level == IterMapLevel::Bijective) { + if (has_padding) { + ErrorLogger(this) << "Bijectvie mapping should not take iter paddings"; + return {}; + } else if (!match_full_iter) { + ErrorLogger(this) << "The iterations do not traverse full iter space"; + return {}; } - } else { - if (!CanProveDivisible(mark->extent, expected_lower_factor)) { - return Array(); + } else if (!has_padding) { + if (!match_iter_divisor) { + ErrorLogger(this) << "The lower factor is not divisible by the full iter space extent"; + return {}; + } + } else if (check_level == IterMapLevel::Surjective) { + PrimExpr extent_before_padding = pad_mark_it->second->extent; + if (match_full_iter) { + if (splits.size() != 1) { + ErrorLogger(this) << "Dependent iterations on padding iter space"; + return Array(); + } else if (analyzer_->CanProveEqual(splits[0]->extent, expected_lower_factor) && + !analyzer_->CanProve(extent_before_padding >= expected_lower_factor)) { + ErrorLogger(this) << "Split on padding iteration is not surjective " + << "if the split extent equals to the full iter space extent"; + return Array(); + } + } else if (match_iter_divisor) { + if (!analyzer_->CanProve(extent_before_padding >= expected_lower_factor)) { + ErrorLogger(this) << "The extent before padding is less than lower factor"; + return Array(); + } + } else { + ErrorLogger(this) << "The lower factor is not divisible by the full iter space extent"; + return {}; } } return Array(iters.rbegin(), iters.rend()); @@ -1018,39 +1056,23 @@ bool IterRangeSanityCheck(const Map& iter_ranges) { return true; } -Array DetectIterMap(const Array& indices, const Map& input_iters, - const PrimExpr& predicate, bool require_bijective, - arith::Analyzer* analyzer, bool simplify_trivial_iterators) { - auto padded_result = DetectPaddedIterMap(indices, input_iters, predicate, require_bijective, - analyzer, simplify_trivial_iterators); - if (padded_result.errors.size()) { - return Array(); - } - if (!analyzer->CanProve(!padded_result.requires_padding)) { - return Array(); - } - return padded_result.indices; -} - -PaddedIterMapResult DetectPaddedIterMap(const Array& indices, - const Map& input_iters, - const PrimExpr& predicate, bool require_bijective, - arith::Analyzer* analyzer, - bool simplify_trivial_iterators) { - PaddedIterMapResult result; +IterMapResult DetectIterMap(const Array& indices, const Map& input_iters, + const PrimExpr& predicate, IterMapLevel check_level, + arith::Analyzer* analyzer, bool simplify_trivial_iterators) { + IterMapResult result; // Overall detection algorithm is divided into two steps: // - Step0: IterMapRewriter rewrites the expression to use IterMapExpr patterns. // - Step1: IterIndependenceChecker checks if the iterator are independent. if (!IterRangeSanityCheck(input_iters)) { - result.errors.push_back("Invalid iterators. Iterators may not be expressions of each other."); + result->errors.push_back("Invalid iterators. 
Iterators may not be expressions of each other."); return result; } Map constrained_input_iters = input_iters; std::vector constraints; if (!is_one(predicate) && !MatchBoundConstraints(predicate, &constrained_input_iters, &constraints)) { - result.errors.push_back("Could not parse predicate as constraints on the input iterators."); + result->errors.push_back("Could not parse predicate as constraints on the input iterators."); return result; } // We have to make sure when we visit an iterator, all the constraints related with its successors @@ -1065,58 +1087,65 @@ PaddedIterMapResult DetectPaddedIterMap(const Array& indices, [](const IterConstraint& a, const IterConstraint& b) { return a.expr_size < b.expr_size; }); IterMapRewriter rewriter(analyzer, constrained_input_iters, simplify_trivial_iterators, - &result.errors); + &result->errors); // Step0.0: rewrite constraints in the order from size-small ones to size-big ones for (const IterConstraint& constraint : constraints) { auto res = rewriter.RewriteIterConstraint(constraint.iter, constraint.lower_bound, constraint.upper_bound); - if (result.errors.size()) { + if (result->errors.size() > 0) { return result; } } if (!rewriter.CheckConstraints()) { - result.errors.push_back("Invalid constraints."); + result->errors.push_back("Invalid constraints."); return result; } - // Step0.1: Check each index to determine required padding - bool allow_padding = !require_bijective; + // Step0.1: Rewrite indicies and determine required padding, + // if there is no padding, it should be the final result. + Array rewrite_indices; + rewrite_indices.reserve(indices.size()); + bool allow_padding = check_level != IterMapLevel::Bijective; if (allow_padding) { for (PrimExpr value : indices) { - rewriter.UpdatePadding(value); + rewrite_indices.push_back(rewriter.RewriteAndUpdatePadding(value)); + if (result->errors.size() > 0) { + return result; + } } } - // Step0.2: rewrite indices - for (PrimExpr value : indices) { - result.indices.push_back(rewriter.Rewrite(value)); - if (result.errors.size()) { - return result; + // Step0.2: Rewrite indices in the second round. + if (!allow_padding || rewriter.requires_padding()) { + rewrite_indices.clear(); + for (PrimExpr value : indices) { + rewrite_indices.push_back(rewriter.Rewrite(value)); + if (result->errors.size() > 0) { + return result; + } } } - - result.requires_padding = rewriter.requires_padding(); - result.padding_predicate = rewriter.padding_predicate(); + result->padding_predicate = rewriter.padding_predicate(); // Step1: IterIndependenceChecker checks if the iterator are independent. 
- if (!rewriter.CheckMapping(result.indices, require_bijective)) { - if (require_bijective) { - result.errors.push_back("Index mapping does not form a bijective transform."); + if (!rewriter.CheckMapping(rewrite_indices, check_level)) { + if (check_level == IterMapLevel::Bijective) { + result->errors.push_back("Index mapping does not form a bijective transform."); } else { - result.errors.push_back("Mapped indices are not independent."); + result->errors.push_back("Mapped indices are not independent."); } return result; } - + result->indices = rewrite_indices; return result; } TVM_REGISTER_GLOBAL("arith.DetectIterMap") .set_body_typed([](const Array& indices, const Map& input_iters, - const PrimExpr& input_pred, bool is_bijective, + const PrimExpr& input_pred, int check_level, bool simplify_trivial_iterators) { arith::Analyzer ana; - return DetectIterMap(indices, input_iters, input_pred, is_bijective, &ana, + return DetectIterMap(indices, input_iters, input_pred, IterMapLevel(check_level), &ana, simplify_trivial_iterators); }); @@ -1246,15 +1275,17 @@ IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr o auto split = Downcast(dividend); return IterSumExpr({split}, make_zero(split.dtype())); } else if (dividend->IsInstance()) { - auto opt_fused = TryFuseIters(Downcast(dividend)); + auto sum = Downcast(dividend); + if (sum->args.size() <= 1) { + return sum; + } + auto opt_fused = TryFuseIters(sum); if (!opt_fused) { ErrorLogger(this) << "Dividend " << tvm::PrettyPrint(original_dividend) << ", can't be written as a single fused IterSum"; return IterSumExpr(); } - IterSumExpr fused = opt_fused.value(); - ICHECK_EQ(fused->args.size(), 1U); return fused; } else { @@ -1263,140 +1294,159 @@ IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr o } } +/*! \brief Find approximate least common multiplier. 
*/ +PrimExpr ApproxLeastCommonMultiple(const PrimExpr& a, const PrimExpr& b, Analyzer* analyzer) { + auto fsplit = [](const PrimExpr& e) -> std::pair { + if (const IntImmNode* imm = e.as()) { + return {1, imm->value}; + } + PVar pv; + PVar pc; + if ((pv * pc).Match(e) || (pc * pv).Match(e)) { + return {pv.Eval(), pc.Eval()->value}; + } else { + return {e, 1}; + } + }; + auto p1 = fsplit(a); + auto p2 = fsplit(b); + auto const_lcm = Integer(LeastCommonMultiple(p1.second, p2.second)); + if (analyzer->CanProveEqual(p1.first, p2.first)) { + return p1.first * const_lcm; + } else if (analyzer->CanProveEqual(floormod(p1.first, p2.first), 0)) { + return p1.first * const_lcm; + } else if (analyzer->CanProveEqual(floormod(p2.first, p1.first), 0)) { + return p2.first * const_lcm; + } else { + return (p1.first * p2.first) * const_lcm; + } +} + std::pair IterMapRewriter::PadDividendToDivisor(IterSplitExpr split, PrimExpr base, PrimExpr divisor) { // If FloorDiv: (((source//lower_factor) % extent) + base) // divisor // If FloorMod: (((source//lower_factor) % extent) + base) % divisor - PrimExpr lookup_key = split; - - auto modified_divisor = [&]() { - if (update_iterator_padding_) { - return divisor; - } - - auto it = padded_iter_map_.find(lookup_key); - if (it == padded_iter_map_.end()) { - return divisor; - } - - const std::vector& divisors = it->second.divisors; - PrimExpr largest_divisor = divisor; - for (const auto& other : divisors) { - if (CanProveDivisible(other, largest_divisor)) { - // New one is bigger, use it - largest_divisor = other; - } else if (CanProveDivisible(largest_divisor, other)) { - // Current is bigger, keep it - } else { - ErrorLogger(this) << "Iterator appears in multiple terms with incompatible divisors " - << tvm::PrettyPrint(largest_divisor) << " and " - << tvm::PrettyPrint(other); - } - } - return largest_divisor; - }(); - - divisor = modified_divisor; - // First, adding any padding that is on the lower side of a - // FloorDiv/FloorMod, such that floormod(iter-left_pad,divisor) == 0 - // when iter==0. - - PrimExpr left_pad; - - if (is_zero(base)) { - // Padding on the left is unnecessary if base is known to be zero. - left_pad = make_zero(base->dtype); - } else { - left_pad = analyzer_->Simplify(floormod(base, divisor)); - } + // FloorDiv/FloorMod, such that floormod(split - left_pad, divisor) == 0 + // when iter == 0. + PrimExpr left_pad = analyzer_->Simplify(floormod(base, divisor)); // Next, adding any padding that is on the upper side of a - // FloorDiv/FloorMod, such that floormod(left_pad + iter + right_pad, divisor) == 0 - // when iter==extent. - + // FloorDiv/FloorMod, such that floormod(left_pad + split + right_pad, divisor) == 0 + // when iter == extent. PrimExpr right_edge = left_pad + split->extent; PrimExpr right_pad; - if (CanProveDivisible(right_edge, divisor)) { - // Padding on the right is unnecessary if the extent is a multiple of - // the divisor. right_pad = 0; } else { right_pad = analyzer_->Simplify(floormod(-right_edge, divisor)); } - if (is_zero(left_pad) && is_zero(right_pad)) { - return {split, left_pad}; - } - + const IterMark& mark = split->source; if (update_iterator_padding_) { // In the first pass, the primary goal is to collect all the divisors - // that may be used for padding. These will impact the divisor used - // to determine padding in the second pass. 
-    IterPaddingInfo& info = padded_iter_map_[lookup_key];
-
-    info.divisors.push_back(divisor);
-
-    PrimExpr padded_extent = left_pad + split->extent + right_pad;
-
-    IterSumExpr as_sum({split}, left_pad);
-    IterMark mark(as_sum, padded_extent);
-    IterSplitExpr new_split(mark);
-
-    return {new_split, left_pad};
+    // that may be used for padding.  These will impact the divisor used
+    // to determine padding in the second pass.  We try to add padding to the
+    // split's source iteration mark so that all splits under the same mark
+    // share the same padded source iteration.
+    auto& info = padded_iter_map_[mark];
+    info.padding_factor =
+        ApproxLeastCommonMultiple(info.padding_factor, divisor * split->lower_factor, analyzer_);
+
+    // If the split itself requires no padding, return directly.
+    if (is_zero(left_pad) && is_zero(right_pad)) {
+      return {split, 0};
+    }
+
+    // Update the padding requirement on the lower side of the source iter mark.
+    // In the second pass, each split checks whether the maximum left padding
+    // on the iter mark is compatible with its own left padding.
+    requires_padding_ = true;
+    PrimExpr mark_left_pad = left_pad * split->lower_factor;
+    info.left_pad = max(info.left_pad, mark_left_pad);
+
+    // Since we only care about the extent in the first pass's result,
+    // we just create a result with a compatible padded extent, ignoring
+    // possible relations between different padded iters.
+    PrimExpr padded_extent = analyzer_->Simplify(left_pad + split->extent + right_pad);
+    split.CopyOnWrite()->extent = padded_extent;
+    return {split, left_pad};
   }
 
-  // Any padding that is required during parsing should have been found
-  // during the first pass that determines the GCD.
-  auto it = padded_iter_map_.find(lookup_key);
+  // In the second pass, update the iteration mark to its padded form.
+  auto it = padded_iter_map_.find(mark);
   if (it == padded_iter_map_.end()) {
-    ErrorLogger(this) << "Dividend has extent " << tvm::PrettyPrint(split->extent) << " and offset "
-                      << tvm::PrettyPrint(base) << ", which requires padding for divisor "
-                      << tvm::PrettyPrint(divisor) << ".";
-    return {IterSplitExpr(), left_pad};
+    return {split, left_pad};
   }
-
-  IterPaddingInfo& info = it->second;
-
-  if (info.padded.defined()) {
-    // A previous visit already applied padding to this iterator.
-    // (e.g. Visiting `(i+1)//4`, then visiting `(i+1)%4`).
-    ICHECK(analyzer_->CanProveEqual(info.left_pad, left_pad));
-    ICHECK(analyzer_->CanProveEqual(info.right_pad, right_pad));
-
-    return {info.padded, left_pad};
+  auto& info = it->second;
+  if (is_zero(info.left_pad) && CanProveDivisible(mark->extent, info.padding_factor)) {
+    // the iter mark requires no padding
+    return {split, left_pad};
   }
 
-  // This is the first encounter with the iterator during the second pass.
-  IterSumExpr as_sum({split}, left_pad);
-  IterMark mark(as_sum, left_pad + split->extent + right_pad);
-  info.padded = IterSplitExpr(mark);
-  info.left_pad = left_pad;
-  info.right_pad = right_pad;
-
-  auto left_padding_introduced = (left_pad != 0);
-  // Equivalent to (0 <= split < left_pad), but easier to simplify in
-  // terms of the transformed variables.
-  auto left_padding_predicate =
-      left_padding_introduced && (floordiv(info.padded, divisor) == floordiv(base, divisor) &&
-                                  floormod(info.padded, divisor) < left_pad);
-
-  PrimExpr nparts = ceildiv(right_edge, divisor);
-
-  auto right_padding_introduced = (right_pad != 0);
-
-  // Equivalent to (right_edge <= split < right_edge+right_pad), but
-  // easier to simplify in terms of the transformed variables.
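To make the left/right padding arithmetic above concrete, here is the same computation carried out with plain Python integers for base 71, extent 45 and divisor 32, the constants exercised by test_padding later in this patch (this is a worked example, not the TVM code path itself):

base, extent, divisor = 71, 45, 32

left_pad = base % divisor                      # floormod(base, divisor) = 7
right_edge = left_pad + extent                 # 52
right_pad = (-right_edge) % divisor            # 12
padded_extent = left_pad + extent + right_pad  # 64, a multiple of the divisor
assert padded_extent % divisor == 0

# After padding, floordiv(base + x, divisor) becomes a clean split of the padded
# iterator: it covers padded_extent // divisor = 2 values, offset by
# (base - left_pad) // divisor = 2, matching the (extent=2, base=2) pattern that
# test_padding asserts for floordiv(x + 71, 32).
values = {(base + x) // divisor for x in range(extent)}
assert values == {2, 3}
assert padded_extent // divisor == 2
assert (base - left_pad) // divisor == 2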
- auto right_padding_predicate = right_padding_introduced && - (floordiv(info.padded, divisor) == floordiv(right_edge, divisor) && - floormod(info.padded, divisor) >= floormod(right_edge, divisor)); - - requires_padding_ = requires_padding_ || (left_padding_introduced || right_padding_introduced); - padding_predicate_ = padding_predicate_ || (left_padding_predicate || right_padding_predicate); + // check that padding factor is compatible with current split and divisor + ICHECK(CanProveDivisible(info.padding_factor, split->lower_factor)) + << "The padding factor " << info.padding_factor << " is not divisible by " + << split->lower_factor << " for the split " << split; + ICHECK(CanProveDivisible(info.padding_factor, divisor)) + << "The padding factor " << info.padding_factor << " is not divisible by " << divisor + << " for the split " << split; + + if (!info.padded.defined()) { + // the first time encounter the iter mark to pad, update the padded mark. + PrimExpr mark_left_pad = info.left_pad; + if (CanProveDivisible(mark_left_pad, split->lower_factor)) { + // correct current split's left padding + // (mark_left_pad + iter) // lower_factor % extent => + // (left_pad * lower_factor + mark) // lower_factor % extent => + // (left_pad + mark // lower_factor) % extent => + // left_pad + (mark // lower_factor % extent) => + // left_pad + split + // since the extent covers the full padding range. + left_pad = floordiv(mark_left_pad, split->lower_factor); + } else { + ErrorLogger(this) << "Detect incompatible left padding on " + << tvm::PrettyPrint(NormalizeIterMapToExpr(split)) + << ", the iter mark is left padded with " << mark_left_pad; + return {IterSplitExpr(), PrimExpr()}; + } - return {info.padded, left_pad}; + PrimExpr right_edge = mark->extent + mark_left_pad; + PrimExpr mark_right_pad; + if (CanProveDivisible(right_edge, info.padding_factor)) { + mark_right_pad = 0; + } else { + mark_right_pad = floormod(-right_edge, info.padding_factor); + } + PrimExpr padded_extent = analyzer_->Simplify(right_edge + mark_right_pad); + info.right_pad = mark_right_pad; + info.padded = IterMark(IterSumExpr({IterSplitExpr(mark)}, mark_left_pad), padded_extent); + padded_origin_map_[info.padded] = mark; + + auto left_padding_introduced = (mark_left_pad != 0); + + // Equivalent to (0 <= split < left_pad), but easier to simplify in + // terms of the transformed variables. + auto left_padding_predicate = + left_padding_introduced && + (floordiv(info.padded->source, info.padding_factor) == 0 && + floormod(info.padded->source, info.padding_factor) < mark_left_pad); + auto right_padding_introduced = (mark_right_pad != 0); + + // Equivalent to (right_edge <= split < right_edge + right_pad), but + // easier to simplify in terms of the transformed variables. 
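The effect of the two passes can also be observed from Python. A minimal usage sketch, assuming the updated detect_iter_map binding from this series (string check_level, result object with .indices and .errors) and mirroring the "left padding only, offset non-divisible" case from this patch's test_padding; tvm.ir.Range.from_min_extent stands in for the tests' var_dom helper:

import tvm
from tvm.tir import floordiv, floormod

y = tvm.tir.Var("y", "int32")
dom = {y: tvm.ir.Range.from_min_extent(0, 176)}

# The quotient alone is accepted: the source iter mark can be left-padded.
res = tvm.arith.detect_iter_map([floordiv(y + 80, 32)], dom, check_level="surjective")
assert len(res.indices) == 1, res.errors

# Asking for the quotient and the remainder together fails, because the padded
# positions of the remainder are never produced by the original iterator.
res = tvm.arith.detect_iter_map(
    [floordiv(y + 80, 32), floormod(y + 80, 32)], dom, check_level="surjective"
)
assert len(res.indices) == 0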
+ auto right_padding_predicate = + right_padding_introduced && (floordiv(info.padded->source, info.padding_factor) == + floordiv(right_edge, info.padding_factor) && + floormod(info.padded->source, info.padding_factor) >= + floormod(right_edge, info.padding_factor)); + padding_predicate_ = padding_predicate_ || (left_padding_predicate || right_padding_predicate); + } + split.CopyOnWrite()->source = info.padded; + split.CopyOnWrite()->extent = floordiv(info.padded->extent, split->lower_factor); + return {split, left_pad}; } PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, PrimExpr rhs) { @@ -1462,7 +1512,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P /* extent = */ analyzer_->Simplify(floordiv(padded->extent, rhs)), /* scale = */ padded->scale); - auto new_base = floordiv(base - left_pad, rhs); + auto new_base = analyzer_->Simplify(floordiv(base - left_pad, rhs), 6); if (is_zero(new_base)) { return std::move(new_split); } else { @@ -1540,7 +1590,6 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P // We handle scale!=1 in above code, hence we only consider floormod(x, rhs) below // where x=floormod(floordiv(iter, lower_factor), extent) + base - auto pair = PadDividendToDivisor(lhs, base, rhs); IterSplitExpr padded = pair.first; if (!padded.defined()) { @@ -1671,19 +1720,20 @@ PrimExpr NormalizeIterMapToExpr(const PrimExpr& expr) { TVM_REGISTER_GLOBAL("arith.NormalizeIterMapToExpr").set_body_typed(NormalizeIterMapToExpr); Array IterMapSimplify(const Array& indices, const Map& input_iters, - const PrimExpr& input_pred, bool require_bijective) { + const PrimExpr& input_pred, IterMapLevel check_level) { if (!IterRangeSanityCheck(input_iters)) return indices; Analyzer analyzer; - Array rewrite = - DetectIterMap(indices, input_iters, input_pred, require_bijective, &analyzer); + auto res = DetectIterMap(indices, input_iters, input_pred, check_level, &analyzer); + Array rewrite = res->indices; + if (rewrite.empty()) { return indices; } - Array res; - res.reserve(rewrite.size()); + Array simplified; + simplified.reserve(rewrite.size()); IterMapToExprNormalizer converter(&analyzer); - for (const auto& expr : rewrite) res.push_back(converter.Convert(expr)); - return res; + for (const auto& expr : rewrite) simplified.push_back(converter.Convert(expr)); + return simplified; } /*! 
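IterMapSimplify above is essentially DetectIterMap followed by conversion of each detected IterSumExpr back to a plain PrimExpr. A small Python sketch of that round trip, assuming the updated bindings; it reuses the fused/split pattern that test_normalize_iter_map_to_expr checks later in this patch:

import tvm
from tvm.tir import floordiv, floormod

x = tvm.tir.Var("x", "int32")
y = tvm.tir.Var("y", "int32")
dom = {
    x: tvm.ir.Range.from_min_extent(0, 10),
    y: tvm.ir.Range.from_min_extent(0, 9),
}

# fuse(yo, xo, yi) where x is split by 5 and y by 3.
fused = floordiv(y, 3) * 6 + floordiv(x, 5) * 3 + floormod(y, 3)

res = tvm.arith.detect_iter_map([fused, floormod(x, 5)], dom, check_level="surjective")
assert len(res.indices) == 2, res.errors

# Detection followed by normalization recovers the simplified index expressions.
simplified = [tvm.arith.normalize_iter_map_to_expr(e) for e in res.indices]
tvm.ir.assert_structural_equal(simplified[0], fused)
tvm.ir.assert_structural_equal(simplified[1], floormod(x, 5))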
@@ -1963,10 +2013,10 @@ class SubspaceDivider { Array> SubspaceDivide(const Array& bindings, const Map& input_iters, const Array& sub_iters, const PrimExpr& predicate, - bool require_bijective, arith::Analyzer* analyzer) { + IterMapLevel check_level, arith::Analyzer* analyzer) { if (!IterRangeSanityCheck(input_iters)) return Array>(); - const Array& maps = - DetectIterMap(bindings, input_iters, predicate, require_bijective, analyzer); + auto res = DetectIterMap(bindings, input_iters, predicate, check_level, analyzer); + const Array& maps = res->indices; if (maps.empty()) return {}; std::unordered_set inner_iter_set; @@ -1993,10 +2043,10 @@ Array> SubspaceDivide(const Array& bindings, TVM_REGISTER_GLOBAL("arith.SubspaceDivide") .set_body_typed([](const Array& bindings, const Map& root_iters, - const Array& sub_iters, const PrimExpr& predicate, - bool require_bijective) { + const Array& sub_iters, const PrimExpr& predicate, int check_level) { arith::Analyzer ana; - return SubspaceDivide(bindings, root_iters, sub_iters, predicate, require_bijective, &ana); + return SubspaceDivide(bindings, root_iters, sub_iters, predicate, IterMapLevel(check_level), + &ana); }); class InverseAffineIterMapTransformer { @@ -2128,5 +2178,7 @@ Map InverseAffineIterMap(const Array& iter_map, TVM_REGISTER_GLOBAL("arith.InverseAffineIterMap").set_body_typed(InverseAffineIterMap); +TVM_REGISTER_NODE_TYPE(IterMapResultNode); + } // namespace arith } // namespace tvm diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h index 7d1f315b3cb3c..6abcc728fc8de 100644 --- a/src/arith/pattern_match.h +++ b/src/arith/pattern_match.h @@ -203,6 +203,8 @@ class PVar : public Pattern> { return value_; } + T EvalOr(const T& default_value) const { return filled_ ? value_ : default_value; } + protected: /*! \brief The matched value */ mutable T value_; diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index dab78c77a0a1d..f9e38dee48e50 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -776,26 +776,32 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { TVM_TRY_REWRITE_IF(floordiv(floordiv(x, c1) + c2, c3), floordiv(x + c1 * c2, c1 * c3), c1.Eval()->value > 0 && c3.Eval()->value > 0); - if (floordiv(x * c1, c2).Match(ret)) { + if (floordiv(x * c1 + y, c2).Match(ret) || floordiv(x * c1, c2).Match(ret) || + floordiv(y + x * c1, c2).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; - if (c1val > 0 && c2val > 0) { - if (c1val % c2val == 0) return (x * floordiv(c1, c2)).Eval(); - if (c2val % c1val == 0) return floordiv(x, floordiv(c2, c1)).Eval(); + PrimExpr yval = y.EvalOr(Integer(0)); + if (c2val == 0) return ret; + + // try eliminate residue part + PrimExpr residue = + floordiv(x.Eval() * floormod(c1.Eval(), c2val) + floormod(yval, c2val), c2val); + PrimExpr y_div = CanProveEqual(floordiv(yval, c2val), 0) ? 
0 : floordiv(yval, c2val); + auto bound = analyzer_->const_int_bound(residue); + if (bound.defined() && bound->max_value == bound->min_value) { + return x.Eval() * floordiv(c1val, c2.Eval()) + (y_div + Integer(bound->max_value)); } - } - if (floordiv(x * c1 + c2, c3).Match(ret)) { - int64_t c1val = c1.Eval()->value; - int64_t c2val = c2.Eval()->value; - int64_t c3val = c3.Eval()->value; - if (c1val > 0 && c3val > 0 && c3val % c1val == 0 && floormod(c2val, c3val) < c1val) { - // assume c3 == a * c1, x == a * y + b, c2 = d * c3 + e then - // (x * c1 + c2) // c3 - // ==> ((a * y + b) * c1 + d * a * c1 + e) // (a * c1) - // ==> y + d + (b * c1 + e) // c3 - // ==> y + d since 0 <= b * c1 <= (a-1) * c1, 0 <= e < c1 - // ==> x // (c3 // c1) + (c2 // c3) - return (floordiv(x, floordiv(c3, c1)) + floordiv(c2, c3)).Eval(); + + // try simplify divisor + if (c1val > 0 && c2val > 0 && c2val % c1val == 0 && + CanProveLess(floormod(yval, c2val), c1val)) { + // assume c2 == a * c1, x == a * x' + b, y = d * c2 + e then + // (x * c1 + y) // c2 + // ==> ((a * x' + b) * c1 + d * a * c1 + e) // (a * c1) + // ==> x' + d + (b * c1 + e) // c2 + // ==> x' + d since 0 <= b * c1 <= (a-1) * c1, 0 <= e < c1 + // ==> x // (c2 // c1) + (y // c2) + return floordiv(x.Eval(), floordiv(c2val, c1val)) + y_div; } } @@ -804,28 +810,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { TVM_TRY_REWRITE(floordiv(c1 * x, x), c1); // Rules involving 2-operands. - TVM_TRY_REWRITE_IF(floordiv(x * c1 + y, c2), x * floordiv(c1, c2) + floordiv(y, c2), - c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - - TVM_TRY_REWRITE_IF(floordiv(x * c1 + y, c2), floordiv(x, floordiv(c2, c1)), - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c2.Eval()->value % c1.Eval()->value == 0 && - CanProveEqual(floordiv(y.Eval(), c1.Eval()), 0)); - TVM_TRY_REWRITE_IF(floordiv(min(x * c1, y), c2), min(x * floordiv(c1, c2), floordiv(y, c2)), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); TVM_TRY_REWRITE_IF(floordiv(max(x * c1, y), c2), max(x * floordiv(c1, c2), floordiv(y, c2)), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - TVM_TRY_REWRITE_IF(floordiv(y + x * c1, c2), floordiv(y, c2) + x * floordiv(c1, c2), - c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - - TVM_TRY_REWRITE_IF(floordiv(y + x * c1, c2), floordiv(x, floordiv(c2, c1)), - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c2.Eval()->value % c1.Eval()->value == 0 && - CanProveEqual(floordiv(y.Eval(), c1.Eval()), 0)); - TVM_TRY_REWRITE_IF(floordiv(min(y, x * c1), c2), min(floordiv(y, c2), x * floordiv(c1, c2)), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); @@ -878,6 +868,8 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { CanProveGreaterEqual(z.Eval(), 0)); TVM_TRY_REWRITE_IF(floordiv(y + z * x, z), floordiv(y, z) + x, CanProveGreaterEqual(z.Eval(), 0)); + + TVM_TRY_REWRITE_IF(floordiv(x - floormod(x, c1), c1), floordiv(x, c1), c1.Eval()->value != 0); } return ret; } @@ -930,22 +922,22 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { if (IsIndexType(op->dtype)) { // Be-aware of the division rules: we use floordiv/floormod here - TVM_TRY_REWRITE_IF(floormod(x * c1, c2), ZeroWithTypeLike(x), - c2.Eval()->value != 0 && c1.Eval()->value % c2.Eval()->value == 0); - - TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(y, c2), - c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floormod(x * c1, 
c2), floormod(x * floormod(c1, c2), c2), + c2.Eval()->value != 0); TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x, floordiv(c2, c1)) * c1 + y, c1.Eval()->value > 0 && c2.Eval()->value > 0 && c2.Eval()->value % c1.Eval()->value == 0 && CanProveEqual(floordiv(y.Eval(), c1.Eval()), 0)); + TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x * floormod(c1, c2) + y, c2), + c2.Eval()->value > 0); + TVM_TRY_REWRITE_IF(floormod(x + c1, c2), floormod(x, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - TVM_TRY_REWRITE_IF(floormod(x + y * c1, c2), floormod(x, c2), - c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floormod(x + y * c1, c2), floormod(x + y * floormod(c1, c2), c2), + c2.Eval()->value > 0); TVM_TRY_REWRITE_IF(floormod(x * c1, x * c2), x * floormod(c1, c2), c2.Eval()->value != 0); diff --git a/src/arith/rewrite_simplify.h b/src/arith/rewrite_simplify.h index 258f833a7b21b..202b9209da6df 100644 --- a/src/arith/rewrite_simplify.h +++ b/src/arith/rewrite_simplify.h @@ -110,6 +110,8 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer { bool CanProveGreaterEqual(const PrimExpr& x, int64_t val) { return analyzer_->CanProveGreaterEqual(x, val); } + // Whether x < val + bool CanProveLess(const PrimExpr& x, int64_t val) { return analyzer_->CanProveLess(x, val); } // Whether x == val bool CanProveEqual(const PrimExpr& x, int64_t val) { // TODO(tqchen) refer back to super-analyzer. diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index ccf186634b8af..dffb8b4992851 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -75,13 +75,15 @@ inline std::vector ExprSplitAddition(const PrimExpr& expr) { } // Searches for the following types of expr: -// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// mod_l_expr = c +// mult_expr = (a1 + a2 + ... + aj + c1 / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// mod_l_expr = c2 // mod_r_expr = k1 * k2 * ... * ki -// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) +// where c1 ~= c2 mod k1 * k2 * ... * ki +// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c1) // Currently the we will not search the add/mult combinations exhaustively // as it will take too much computation. -inline std::pair MergeMulModInner(const PrimExpr& mult_expr, +inline std::pair MergeMulModInner(arith::Analyzer* analyzer, + const PrimExpr& mult_expr, const PrimExpr& mod_l_expr, const PrimExpr& mod_r_expr) { using namespace tir; @@ -119,9 +121,10 @@ inline std::pair MergeMulModInner(const PrimExpr& mult_expr, } else if (inner_div_ptr) { PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && - expr_equal(inner_div_ptr->a, mod_l_expr)) { + analyzer->CanProveEqual(floormod(inner_div_ptr->a - mod_l_expr, mod_r_expr), 0)) { // Found! - PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; + PrimExpr ret = + no_opt_sum.get() ? 
no_opt_sum * mult_outer + inner_div_ptr->a : inner_div_ptr->a; return std::make_pair(true, ret); } else { return std::make_pair(false, PrimExpr()); @@ -204,7 +207,7 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { bool inner_find_opt = false; while (mult_it != mult_exprs.end()) { std::pair ret = - MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); + MergeMulModInner(analyzer, *mult_it, search_mod_it->first, search_mod_it->second); if (ret.first) { inner_find_opt = true; auto temp_mod_it = search_mod_it; diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 77678d829a8e2..ba329676b1c33 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -76,17 +76,16 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia // Unpack the output indices into linear combinations of the initial // indices. arith::Analyzer analyzer; - auto padded_iter_map = - DetectPaddedIterMap((*this)->final_indices, input_iters, /* predicate = */ 1, - /* require_bijective = */ false, &analyzer, - /* simplify_trivial_iterators = */ false); - CHECK(padded_iter_map.errors.empty()) << "Could not parse mapping as sum of iterators. " - << "Error: " << padded_iter_map.errors[0]; + auto padded_iter_map = DetectIterMap((*this)->final_indices, input_iters, /* predicate = */ 1, + /*check_level=*/arith::IterMapLevel::NoCheck, &analyzer, + /*simplify_trivial_iterators=*/false); + CHECK(padded_iter_map->errors.empty()) << "Could not parse mapping as sum of iterators. " + << "Error: " << padded_iter_map->errors[0]; // Determine expressions for the input variables, in terms of the // output variables. Map inverse_exprs_map = InverseAffineIterMap( - padded_iter_map.indices, Array(output_vars.begin(), output_vars.end())); + padded_iter_map->indices, Array(output_vars.begin(), output_vars.end())); // Unpack the map to an array, maintaining the same parameter order. Array inverse_exprs; @@ -94,7 +93,7 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia inverse_exprs.push_back(inverse_exprs_map.at(index)); } - PrimExpr padding_predicate = padded_iter_map.padding_predicate; + PrimExpr padding_predicate = padded_iter_map->padding_predicate; padding_predicate = arith::NormalizeIterMapToExpr(padding_predicate); padding_predicate = Substitute(padding_predicate, inverse_exprs_map); @@ -141,14 +140,14 @@ IndexMap IndexMap::Inverse(Array initial_ranges) const { // indices. arith::Analyzer analyzer; auto iter_map = DetectIterMap((*this)->final_indices, input_iters, /* predicate = */ 1, - /* require_bijective = */ true, &analyzer, + /* check_level = */ arith::IterMapLevel::Bijective, &analyzer, /* simplify_trivial_iterators = */ false); - CHECK(iter_map.size()) << "Index transformation was not bijective."; + CHECK(iter_map->indices.size()) << "Index transformation was not bijective."; // Determine expressions for the input variables, in terms of the // output variables. - Map inverse_exprs_map = - InverseAffineIterMap(iter_map, Array(output_vars.begin(), output_vars.end())); + Map inverse_exprs_map = InverseAffineIterMap( + iter_map->indices, Array(output_vars.begin(), output_vars.end())); // Unpack the map to an array, maintaining the same parameter order. 
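IndexMap::Inverse above pairs DetectIterMap at the bijective level with InverseAffineIterMap to express the input iterators in terms of the output variables. A minimal Python sketch of that pairing, assuming the updated bindings; the split-by-16 mapping is only an illustrative stand-in, not taken from the patch:

import tvm
import tvm.testing
from tvm.tir import floordiv, floormod

l0 = tvm.tir.Var("l0", "int32")
dom = {l0: tvm.ir.Range.from_min_extent(0, 64)}

iter_map = tvm.arith.detect_iter_map(
    [floordiv(l0, 16), floormod(l0, 16)], dom, check_level="bijective"
).indices
outputs = [tvm.tir.Var("output_{}".format(i), "int32") for i in range(len(iter_map))]
inverse = tvm.arith.inverse_affine_iter_map(iter_map, outputs)

# The input iterator is reconstructed from the two outputs.
tvm.testing.assert_prim_expr_equal(inverse[l0], outputs[0] * 16 + outputs[1])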
Array inverse_exprs; diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index c4719015daa43..83ef6adae3b23 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -533,16 +533,16 @@ bool IsAffineBinding(const BlockRealize& realize, const Map& loop_va if (loop_var_ranges.empty()) { return true; } - Array results = arith::DetectIterMap( + auto res = arith::DetectIterMap( /*indices=*/realize->iter_values, /*input_iters=*/loop_var_ranges, /*predicate=*/realize->predicate, - /*require_bijective=*/false, + /*check_level=*/arith::IterMapLevel::Surjective, /*analyzer=*/analyzer); - if (results.empty()) { + if (res->indices.empty()) { return false; } - for (const arith::IterSumExpr& sum_expr : results) { + for (const arith::IterSumExpr& sum_expr : res->indices) { const Array& args = sum_expr->args; if (!args.empty() && !is_one(args[0]->scale)) { return false; diff --git a/src/tir/schedule/analysis/layout.cc b/src/tir/schedule/analysis/layout.cc index 993557f8be2f8..b0cafac3151f7 100644 --- a/src/tir/schedule/analysis/layout.cc +++ b/src/tir/schedule/analysis/layout.cc @@ -68,17 +68,18 @@ class SplitExprCollector { * \param index The indexing pattern * \param input_iters The input iterators' domain * \param predicate The predicate of the affine map - * \param require_bijective Whether the affine map is required to be bijective + * \param check_level The iter mapping checking level * \param analyzer The analyzer * \return The collected split expressions */ static std::vector Collect(const PrimExpr& index, const Map& input_iters, // const PrimExpr& predicate, // - bool require_bijective, // + arith::IterMapLevel check_level, // arith::Analyzer* analyzer) { - Array iter_sum_exprs = arith::DetectIterMap( - {analyzer->Simplify(index)}, input_iters, predicate, require_bijective, analyzer); + arith::IterMapResult res = arith::DetectIterMap({analyzer->Simplify(index)}, input_iters, + predicate, check_level, analyzer); + const auto& iter_sum_exprs = res->indices; if (iter_sum_exprs.empty()) { return {}; } @@ -149,7 +150,7 @@ Optional SuggestIndexMap(const Buffer& buffer, const Array& // Step 3. Detect the IterSplitExpr of the indexing pattern std::vector split_exprs = SplitExprCollector::Collect( /*index=*/f_flatten_index(indices), input_iters, predicate, - /*require_bijective=*/false, analyzer); + /*check_level=*/arith::IterMapLevel::Surjective, analyzer); if (split_exprs.empty()) { return NullOpt; } diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index 7ed80a1c5b8f2..4ede2dd90da80 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -258,10 +258,9 @@ Array> CheckSubspaceDivisible(const IRModule& mod, arith::Analyzer* analyzer) { const Block& block = block_realize->block; - Array> division = - arith::SubspaceDivide(block_realize->iter_values, collector.loop_var_domain, - collector.inner_loop_vars, block_realize->predicate, - /*require_bijective=*/false, analyzer); + Array> division = arith::SubspaceDivide( + block_realize->iter_values, collector.loop_var_domain, collector.inner_loop_vars, + block_realize->predicate, arith::IterMapLevel::Surjective, analyzer); if (division.empty()) { // If we can't do perfect subspace division, check if it is a trivial case of subspace division. 
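The SubspaceDivide call above is what blockize/tensorize uses to split block bindings into an outer part over the outer loop iterators and an inner part over sub_iters. A small Python sketch with the binding's default check level; the i/j binding is an illustrative stand-in rather than a case from this patch:

import tvm
import tvm.testing

i = tvm.tir.Var("i", "int32")
j = tvm.tir.Var("j", "int32")
dom = {
    i: tvm.ir.Range.from_min_extent(0, 4),
    j: tvm.ir.Range.from_min_extent(0, 8),
}

# Divide the single binding i*8 + j over the inner subspace {j}.
# Each row of the result is an [outer, inner] pair; the final row holds the
# outer/inner predicates.
res = tvm.arith.subspace_divide([i * 8 + j], dom, [j])
outer, inner = res[0][0], res[0][1]
tvm.testing.assert_prim_expr_equal(outer, i)  # outer part iterates over i
tvm.testing.assert_prim_expr_equal(inner, j)  # inner part iterates over j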
diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 2a349f8fe61ed..7f1d74ac20214 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -244,7 +244,7 @@ class ScopeReconstructor : private StmtMutator { if (preserve_unit_loops || !is_one(iter_dom->extent)) { Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(32)); loop_vars.push_back(var); - loop_extents.push_back(iter_dom->extent); + loop_extents.push_back(analyzer->Simplify(iter_dom->extent)); iter_values.push_back(iter_dom->min + var); analyzer->Bind(var, Range::FromMinExtent(0, iter_dom->extent)); } else { diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc index 452f72e7228f0..ad15e06e285af 100644 --- a/src/tir/schedule/primitive/compute_inline.cc +++ b/src/tir/schedule/primitive/compute_inline.cc @@ -552,13 +552,14 @@ class ReverseComputeInliner : public BaseInliner { } } - buffer_load_iter_map_ = arith::DetectIterMap( + auto res = arith::DetectIterMap( /*indices=*/buffer_load_indices_, /*input_iters=*/consumer_iter_doms, /*predicate=*/true, - /*require_bijective=*/true, + /*check_level=*/arith::IterMapLevel::Bijective, /*analyzer=*/&analyzer, /*simplify_trivial_iterators=*/false); + buffer_load_iter_map_ = res->indices; if (buffer_load_iter_map_.empty()) { // Failure: indices of BufferLoad are not bijective affine return false; diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 6da796fc955f3..692f68a600ae9 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -392,8 +392,9 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref, auto iter_map = arith::DetectIterMap( /*indices=*/transformed_block_iters, /*input_iters=*/block_iter_dom, /*predicate=*/Bool(true), - /*require_bijective=*/true, &analyzer, /*simplify_trivial_iterators=*/true); - if (iter_map.empty()) { + /*check_level=*/arith::IterMapLevel::Bijective, &analyzer, + /*simplify_trivial_iterators=*/true); + if (iter_map->indices.empty()) { throw NotBijectiveAffineIndexMapError(self->mod, index_map); } @@ -417,7 +418,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref, // Step 5.2: Update the block body. Use the inverse map f^{-1} to replace the original block iters // in the body. - auto inverse_map = arith::InverseAffineIterMap(iter_map, new_block_vars); + auto inverse_map = arith::InverseAffineIterMap(iter_map->indices, new_block_vars); // Trivial block iters will be simplified in DetectIterMap, they should be mapped to constant // zero. 
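TransformBlockLayout above now requests a bijective mapping and raises NotBijectiveAffineIndexMapError when the returned indices are empty. A minimal Python illustration of the surjective/bijective distinction, assuming the updated bindings and mirroring the cases from test_trivial in this patch's tests:

import tvm

x = tvm.tir.Var("x", "int32")
y = tvm.tir.Var("y", "int32")
z = tvm.tir.Var("z", "int32")
dom = {
    x: tvm.ir.Range.from_min_extent(0, 3),
    y: tvm.ir.Range.from_min_extent(0, 4),
    z: tvm.ir.Range.from_min_extent(0, 1),
}

# [x, y] is bijective over the domain (z has a trivial extent of 1) ...
res = tvm.arith.detect_iter_map([x, y], dom, check_level="bijective")
assert len(res.indices) == 2, res.errors

# ... while [x, z] leaves y uncovered, so the bijective check rejects it and
# records the reason in res.errors.
res = tvm.arith.detect_iter_map([x, z], dom, check_level="bijective")
assert len(res.indices) == 0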
for (const auto& iter_var : block_ptr->iter_vars) { diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index dbe6a3bbc0c5c..5315b139f0f6f 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -115,7 +115,7 @@ class IterMapSimplifyBlockBinding : public StmtExprMutator { Array v = arith::IterMapSimplify(/*indices=*/op->iter_values, /*input_iters=*/loop_var2extent_, /*input_pred=*/op->predicate, - /*require_bijective=*/false); + /*check_level=*/arith::IterMapLevel::Surjective); if (v.same_as(op->iter_values)) { return GetRef(op); } else { diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index fe766b921806b..d7bfa1c919478 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -14,9 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from xml import dom import tvm import tvm.testing -from tvm import te from tvm.tir import floormod, floordiv @@ -48,56 +48,69 @@ def convert_iter_expr(expr): return tvm.arith.normalize_iter_map_to_expr(expr) -def assert_iter_sum_pattern(sum_expr, extent, base, scale=1): - """Check the sum expr have the right pattern.""" - assert isinstance(sum_expr, tvm.arith.IterSumExpr) - if extent == 1: - assert len(sum_expr.args) == 0 - else: - assert len(sum_expr.args) == 1 - tvm.testing.assert_prim_expr_equal(sum_expr.args[0].extent, extent) - tvm.testing.assert_prim_expr_equal(sum_expr.args[0].scale, scale) - tvm.testing.assert_prim_expr_equal(sum_expr.base, base) +def assert_iter_sum_pattern( + expect_dict, dom_map, predicate=True, check_level="surjective", simplify_trivial_iterators=True +): + keys = list(expect_dict.keys()) + res = tvm.arith.detect_iter_map( + keys, + dom_map, + predicate=predicate, + check_level=check_level, + simplify_trivial_iterators=simplify_trivial_iterators, + ) + indices = res.indices + assert len(indices) == len(keys), res.errors + print(indices) + for i, input_iter in enumerate(keys): + spec = expect_dict[input_iter] + ( + extent, + base, + ) = spec[0:2] + scale = spec[2] if len(spec) > 2 else 1 + expect_iter = spec[3] if len(spec) > 3 else None + sum_expr = indices[i] + assert isinstance(sum_expr, tvm.arith.IterSumExpr) + if extent == 1: + assert len(sum_expr.args) == 0 + else: + assert len(sum_expr.args) == 1 + tvm.testing.assert_prim_expr_equal(sum_expr.args[0].extent, extent) + tvm.testing.assert_prim_expr_equal(sum_expr.args[0].scale, scale) + tvm.testing.assert_prim_expr_equal(sum_expr.base, base) + if expect_iter is not None: + if not isinstance(expect_iter, tvm.arith.IterMapExpr): + sum_expr = convert_iter_expr(sum_expr) + tvm.ir.assert_structural_equal(sum_expr, expect_iter) + + +def assert_iter_sum_failure(iters, dom_map, predicate=True, check_level="surjective"): + res = tvm.arith.detect_iter_map( + list(iters), dom_map, predicate=predicate, check_level=check_level + ).indices + assert len(res) == 0 def test_trivial(): - x = tvm.tir.Var("x", "int32"), 3 - y = tvm.tir.Var("y", "int32"), 4 - z = tvm.tir.Var("z", "int32"), 1 - - res = tvm.arith.detect_iter_map([x[0], y[0], 3], var_dom([x, y])) + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + z = tvm.tir.Var("z", "int32") + dom_map = var_dom([(x, 3), (y, 4), (z, 1)]) - assert len(res) == 3 - 
assert_iter_sum_pattern(res[0], 3, 0) - assert_iter_sum_pattern(res[1], 4, 0) - assert_iter_sum_pattern(res[2], 1, 3) - - res = tvm.arith.detect_iter_map([x[0], 3], var_dom([x, y])) - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 3, 0) - assert_iter_sum_pattern(res[1], 1, 3) + assert_iter_sum_pattern({x: (3, 0), y: (4, 0), 3: (1, 3)}, dom_map) + assert_iter_sum_pattern({x: (3, 0), 3: (1, 3)}, dom_map) # not independent - res = tvm.arith.detect_iter_map([x[0], x[0], 3], var_dom([x, y])) - assert len(res) == 0 + assert_iter_sum_failure([x, x, 3], dom_map) - res = tvm.arith.detect_iter_map( - [x[0], y[0]], var_dom([x, y, z]), require_bijective=True, simplify_trivial_iterators=True + assert_iter_sum_pattern( + {x: (3, 0), y: (4, 0)}, dom_map, check_level="bijective", simplify_trivial_iterators=True ) - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 3, 0) - assert_iter_sum_pattern(res[1], 4, 0) - - res = tvm.arith.detect_iter_map( - [x[0], y[0]], var_dom([x, y, z]), require_bijective=True, simplify_trivial_iterators=False + assert_iter_sum_pattern( + {x: (3, 0), y: (4, 0)}, dom_map, check_level="bijective", simplify_trivial_iterators=False ) - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 3, 0) - assert_iter_sum_pattern(res[1], 4, 0) - - # not bijective - res = tvm.arith.detect_iter_map([x[0], z[0]], var_dom([x, y, z]), require_bijective=True) - assert len(res) == 0 + assert_iter_sum_failure([x, z], dom_map, check_level="bijective") def test_fuse(): @@ -106,42 +119,27 @@ def test_fuse(): c = tvm.tir.SizeVar("c", "int32") c0 = tvm.tir.SizeVar("c0", "int32") - res = tvm.arith.detect_iter_map([y * 3 + 1 + c + x], var_dom([(x, 3), (y, 4)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 12, 1 + c) + assert_iter_sum_pattern({y * 3 + 1 + c + x: (12, 1 + c)}, var_dom([(x, 3), (y, 4)])) - res = tvm.arith.detect_iter_map([ifuse([(x, 3), (y, 4)])[0]], var_dom([(x, 3), (y, 4)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 12, 0) + assert_iter_sum_pattern({ifuse([(x, 3), (y, 4)])[0]: (12, 0)}, var_dom([(x, 3), (y, 4)])) # fuse with symbolic factor - res = tvm.arith.detect_iter_map([(y + 1) * c + x], var_dom([(x, c), (y, 4)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 4 * c, c) + assert_iter_sum_pattern({(y + 1) * c + x: (4 * c, c)}, var_dom([(x, c), (y, 4)])) # duplication - res = tvm.arith.detect_iter_map([y * 3 + x, y], var_dom([(x, 3), (y, 4)])) - assert len(res) == 0 - - # duplication 2 - res = tvm.arith.detect_iter_map([y, x + 1, y], var_dom([(x, 3), (y, 4)])) - assert len(res) == 0 + assert_iter_sum_failure([y * 3 + x, y], var_dom([(x, 3), (y, 4)])) + assert_iter_sum_failure([y, x + 1, y], var_dom([(x, 3), (y, 4)])) # factor mismatch - res = tvm.arith.detect_iter_map([y * 4 + x], var_dom([(x, 3), (y, 4)])) - assert len(res) == 0 + assert_iter_sum_failure([y * 4 + x], var_dom([(x, 3), (y, 4)])) # simple stride pattern - res = tvm.arith.detect_iter_map([x * 4 + y * 2], var_dom([(x, 3), (y, 2)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 6, 0, scale=2) - tvm.ir.assert_structural_equal(convert_iter_expr(res[0]), (x * 2 + y) * 2) + assert_iter_sum_pattern({x * 4 + y * 2: (6, 0, 2, (x * 2 + y) * 2)}, var_dom([(x, 3), (y, 2)])) # simple stride pattern with symbolic - res = tvm.arith.detect_iter_map([x * 2 * c0 + y * 2], var_dom([(x, 3), (y, c0)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 3 * c0, 0, scale=2) - tvm.ir.assert_structural_equal(convert_iter_expr(res[0]), (x * c0 + y) * 2) + assert_iter_sum_pattern( + 
{x * 2 * c0 + y * 2: (3 * c0, 0, 2, (x * c0 + y) * 2)}, var_dom([(x, 3), (y, c0)]) + ) def test_split(): @@ -152,171 +150,138 @@ def test_split(): fld = tvm.tir.floordiv flm = tvm.tir.floormod - res = tvm.arith.detect_iter_map([fld(x, 3), flm(x, 3) * 2 + c1], var_dom([(x, 24)])) + assert_iter_sum_pattern({fld(x, 3): (8, 0), flm(x, 3) * 2 + c1: (3, c1, 2)}, var_dom([(x, 24)])) - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 8, 0) - assert_iter_sum_pattern(res[1], 3, c1, 2) - - res = tvm.arith.detect_iter_map([fld(x, 6), fld(flm(x, 6), 2), flm(x, 2)], var_dom([(x, 24)])) - - assert len(res) == 3 - assert_iter_sum_pattern(res[0], 4, 0) - assert_iter_sum_pattern(res[1], 3, 0) - assert_iter_sum_pattern(res[2], 2, 0) + assert_iter_sum_pattern( + {fld(x, 6): (4, 0), fld(flm(x, 6), 2): (3, 0), flm(x, 2): (2, 0)}, var_dom([(x, 24)]) + ) # simple symbolic bound # TODO(tvm-team) improve symbolic divisible check to enable # more complicated symbolic bound - res = tvm.arith.detect_iter_map([fld(x, c0), flm(x, c0)], var_dom([(x, c1 * c0)])) - - assert len(res) == 2 - assert_iter_sum_pattern(res[0], c1, 0) - assert_iter_sum_pattern(res[1], c0, 0) - - res = tvm.arith.detect_iter_map([fld(x * 2, 4), flm(x * 2, 4)], var_dom([(x, 8)])) - - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 4, 0, scale=1) - assert_iter_sum_pattern(res[1], 2, 0, scale=2) + assert_iter_sum_pattern({fld(x, c0): (c1, 0), flm(x, c0): (c0, 0)}, var_dom([(x, c1 * c0)])) - res = tvm.arith.detect_iter_map([fld(x * 2, 4) * 4 + flm(x * 2, 4)], var_dom([(x, 8)])) + assert_iter_sum_pattern({fld(x * 2, 4): (4, 0, 1), flm(x * 2, 4): (2, 0, 2)}, var_dom([(x, 8)])) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 8, 0, scale=2) + assert_iter_sum_pattern( + { + fld(x * 2, 4) * 4 + flm(x * 2, 4): (8, 0, 2), + }, + var_dom([(x, 8)]), + ) - res = tvm.arith.detect_iter_map([fld(x, flm(flm(y, 8), 6))], var_dom([(x, 24), (y, 8)])) - assert len(res) == 0 + assert_iter_sum_failure([fld(x, flm(flm(y, 8), 6))], var_dom([(x, 24), (y, 8)])) def test_compound(): - x = tvm.tir.Var("x", "int32"), 10 - y = tvm.tir.Var("y", "int32"), 9 + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") - xo, xi = isplit(x, 5) - yo, yi = isplit(y, 3) + xo, xi = isplit((x, 10), 5) + yo, yi = isplit((y, 9), 3) z = ifuse([yo, xo, yi]) - res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([x, y])) - - assert len(res) == 2 - assert_iter_sum_pattern(res[0], 18, 0) - assert_iter_sum_pattern(res[1], 5, 0) # reconstruct the pattern manually - mx = tvm.arith.IterMark(x[0], 10) - my = tvm.arith.IterMark(y[0], 9) - + mx = tvm.arith.IterMark(x, 10) + my = tvm.arith.IterMark(y, 9) xoscale = 3 - xiscale = 1 yoscale = 6 yiscale = 1 mxo = tvm.arith.IterSplitExpr(mx, 5, 2, xoscale) - mxi = tvm.arith.IterSplitExpr(mx, 1, 5, xiscale) myo = tvm.arith.IterSplitExpr(my, 3, 3, yoscale) myi = tvm.arith.IterSplitExpr(my, 1, 3, yiscale) - mz = tvm.arith.IterMark(tvm.arith.IterSumExpr([myo, mxo, myi], 0), 18) sz = tvm.arith.IterSumExpr([tvm.arith.IterSplitExpr(mz, 1, 18, 1)], 0) - tvm.ir.assert_structural_equal(sz, res[0]) + assert_iter_sum_pattern({z[0]: (18, 0, 1, sz), xi[0]: (5, 0)}, var_dom([(x, 10), (y, 9)])) def test_predicate(): - x = tvm.tir.Var("x", "int32"), 13 - y = tvm.tir.Var("y", "int32"), 10 + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") # available contraints # upper bound only - res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 128) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 128, 0) - 
res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] <= 127) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 128, 0) + assert_iter_sum_pattern( + {x * 10 + y: (128, 0)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 + y < 128 + ) + + assert_iter_sum_pattern( + {x * 10 + y: (128, 0)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 + y <= 127 + ) # lower bound only - res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] > 5) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 124, 6) - res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] >= 6) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 124, 6) + assert_iter_sum_pattern( + {x * 10 + y: (124, 6)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 + y > 5 + ) + + assert_iter_sum_pattern( + {x * 10 + y: (124, 6)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 + y >= 6 + ) # lower bound + upper bound - res = tvm.arith.detect_iter_map( - [x[0] * 10 + y[0]], - var_dom([x, y]), - tvm.tir.And(x[0] * 10 + y[0] > 5, x[0] * 10 + y[0] < 128), + assert_iter_sum_pattern( + {x * 10 + y: (122, 6)}, + var_dom([(x, 13), (y, 10)]), + predicate=tvm.tir.And(x * 10 + y > 5, x * 10 + y < 128), ) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 122, 6) - res = tvm.arith.detect_iter_map( - [x[0] * 10 + y[0]], - var_dom([x, y]), - tvm.tir.And(x[0] * 10 + y[0] >= 6, x[0] * 10 + y[0] <= 127), + + assert_iter_sum_pattern( + {x * 10 + y: (122, 6)}, + var_dom([(x, 13), (y, 10)]), + predicate=tvm.tir.And(x * 10 + y >= 6, x * 10 + y <= 127), ) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 122, 6) # constraint on one fused iter i = tvm.tir.Var("i", "int32") j = tvm.tir.Var("j", "int32") k = tvm.tir.Var("k", "int32") - res = tvm.arith.detect_iter_map( - [i * 8 + j * 2 + k], + assert_iter_sum_pattern( + {i * 8 + j * 2 + k: (88, 1)}, var_dom([(i, 11), (j, 5), (k, 2)]), - tvm.tir.all(1 <= j * 2 + k, j * 2 + k < 9), + predicate=tvm.tir.all(1 <= j * 2 + k, j * 2 + k < 9), ) - assert_iter_sum_pattern(res[0], 88, 1) # constraint on single var - res = tvm.arith.detect_iter_map([i], var_dom([(i, 48)]), tvm.tir.all(i < 10)) - assert_iter_sum_pattern(res[0], 10, 0) + assert_iter_sum_pattern({i: (10, 0)}, var_dom([(i, 48)]), predicate=i < 10) - # iterations are subparts of constraint, invalid, case 1 - res = tvm.arith.detect_iter_map( + # iterations are subparts of constraint, invalid case 1 + assert_iter_sum_failure( [i, j, k], var_dom([(i, 128), (j, 128), (k, 128)]), - tvm.tir.all(i * 16384 + j * 128 + k < 100), + predicate=tvm.tir.all(i * 16384 + j * 128 + k < 100), ) - assert len(res) == 0 - # iterations are subparts of constraint, invalid, case 2 - res = tvm.arith.detect_iter_map( + # iterations are subparts of constraint, invalid case 2 + assert_iter_sum_failure( [i * 128 + j, k], var_dom([(i, 128), (j, 128), (k, 128)]), - tvm.tir.all(i * 16384 + j * 128 + k < 100), + predicate=i * 16384 + j * 128 + k < 100, ) - assert len(res) == 0 # irrelavant predicate - res = tvm.arith.detect_iter_map( - [i + j], - var_dom([(i, 1)]), - j <= 24, - ) - assert_iter_sum_pattern(res[0], 1, j) + assert_iter_sum_pattern({i + j: (1, j)}, var_dom([(i, 1)]), predicate=j <= 24) # constraint on nested fused iters - res = tvm.arith.detect_iter_map( - [i * 8 + j * 2 + k], + assert_iter_sum_pattern( + {i * 8 + j * 2 + k: (22, 3)}, var_dom([(i, 11), (j, 5), (k, 2)]), - tvm.tir.all(1 <= j * 2 + k, j * 2 + k < 9, 3 <= i * 8 + j * 2 + k, i * 8 + j * 2 + k < 25), + 
predicate=tvm.tir.all( + 1 <= j * 2 + k, j * 2 + k < 9, 3 <= i * 8 + j * 2 + k, i * 8 + j * 2 + k < 25 + ), ) - assert_iter_sum_pattern(res[0], 22, 3) # duplicate constraint on one fused iter - res = tvm.arith.detect_iter_map( - [i * 6 + j * 2 + k], + assert_iter_sum_pattern( + {i * 6 + j * 2 + k: (66, 2)}, var_dom([(i, 11), (j, 5), (k, 2)]), - tvm.tir.all(1 <= j * 2 + k, 2 <= j * 2 + k, j * 2 + k < 8, j * 2 + k < 9), + predicate=tvm.tir.all(1 <= j * 2 + k, 2 <= j * 2 + k, j * 2 + k < 8, j * 2 + k < 9), ) - assert_iter_sum_pattern(res[0], 66, 2) # duplicate constraint on nested fused iters - res = tvm.arith.detect_iter_map( - [i * 6 + j * 2 + k], + assert_iter_sum_pattern( + {i * 6 + j * 2 + k: (15, 3)}, var_dom([(i, 11), (j, 5), (k, 2)]), - tvm.tir.all( + predicate=tvm.tir.all( 1 <= j * 2 + k, 2 <= j * 2 + k, j * 2 + k < 8, @@ -327,15 +292,13 @@ def test_predicate(): i * 6 + j * 2 + k < 18, ), ) - assert_iter_sum_pattern(res[0], 15, 3) # constraint on non-disjoint fused iters should fail - res = tvm.arith.detect_iter_map( + assert_iter_sum_failure( [i * 8 + j * 2 + k], var_dom([(i, 11), (j, 5), (k, 2)]), - tvm.tir.all(2 <= j * 2 + k, 0 <= i * 4 + j), + predicate=tvm.tir.all(2 <= j * 2 + k, 0 <= i * 4 + j), ) - assert len(res) == 0 # constraint on many disjoint fused iters, case 1 # i4 * 6 + i5 in [3, 9), extent=6 (= scale of i2) @@ -347,147 +310,135 @@ def test_predicate(): i3 = tvm.tir.Var("i3", "int32") i4 = tvm.tir.Var("i4", "int32") i5 = tvm.tir.Var("i5", "int32") - res = tvm.arith.detect_iter_map( - [i0 * 180 + i1 * 60 + i2 * 30 + i3 * 15 + i4 * 6 + i5], + assert_iter_sum_pattern( + {i0 * 180 + i1 * 60 + i2 * 30 + i3 * 15 + i4 * 6 + i5: (540, 93)}, var_dom([(i0, 3), (i1, 4), (i2, 3), (i3, 2), (i4, 3), (i5, 6)]), - tvm.tir.all(1 <= i1, 2 <= i2 * 2 + i3, 3 <= i4 * 6 + i5), + predicate=tvm.tir.all(1 <= i1, 2 <= i2 * 2 + i3, 3 <= i4 * 6 + i5), ) - assert_iter_sum_pattern(res[0], 540, 93) # constraint on many disjoint fused iters, case 2 - res = tvm.arith.detect_iter_map( - [i0 * 45 + i1 * 45 + i2 * 9 + i3 * 4 + i4], + assert_iter_sum_pattern( + {i0 * 45 + i1 * 45 + i2 * 9 + i3 * 4 + i4: (135, 28)}, var_dom([(i0, 3), (i1, 2), (i2, 5), (i3, 3), (i4, 4)]), - tvm.tir.all(3 <= i1 * 5 + i2, i1 * 5 + i2 < 8, 1 <= i3 * 4 + i4, i3 * 4 + i4 < 10), + predicate=tvm.tir.all( + 3 <= i1 * 5 + i2, i1 * 5 + i2 < 8, 1 <= i3 * 4 + i4, i3 * 4 + i4 < 10 + ), ) - assert_iter_sum_pattern(res[0], 135, 28) # constraint on split iters - res = tvm.arith.detect_iter_map( - [i % 16, i // 16], + assert_iter_sum_pattern( + {i % 16: (7, 3), i // 16: (8, 4)}, var_dom([(i, 1024)]), - tvm.tir.all(3 <= i % 16, i % 16 < 10, 4 <= i // 16, i // 16 < 12), - require_bijective=True, + predicate=tvm.tir.all(3 <= i % 16, i % 16 < 10, 4 <= i // 16, i // 16 < 12), + check_level="bijective", ) - assert_iter_sum_pattern(res[0], 7, 3) - assert_iter_sum_pattern(res[1], 8, 4) # constraint on split iters, nested case 1 - res = tvm.arith.detect_iter_map( - [(i * 32 + j) % 16], + assert_iter_sum_pattern( + {(i * 32 + j) % 16: (7, 3)}, var_dom([(i, 5), (j, 32)]), - tvm.tir.all(3 <= (i * 32 + j) % 16, (i * 32 + j) % 16 < 10), + predicate=tvm.tir.all(3 <= (i * 32 + j) % 16, (i * 32 + j) % 16 < 10), ) - assert_iter_sum_pattern(res[0], 7, 3) # constraint on split iters, nested case 2 - res = tvm.arith.detect_iter_map( - [(i * 32 + j) % 16], + assert_iter_sum_failure( + [ + (i * 32 + j) % 16, + ], var_dom([(i, 5), (j, 32)]), - tvm.tir.all(1 <= i * 32 + j, i * 32 + j <= 32), + predicate=tvm.tir.all(1 <= i * 32 + j, i * 32 + j <= 32), + 
check_level="bijective", ) - assert len(res) == 0 - res = tvm.arith.detect_iter_map( - [(i * 32 + j - 1) % 16, (i * 32 + j - 1) // 16], + assert_iter_sum_pattern( + {(i * 32 + j) % 16: (16, 0)}, var_dom([(i, 5), (j, 32)]), - tvm.tir.all(1 <= i * 32 + j, i * 32 + j <= 64), + predicate=tvm.tir.all(1 <= i * 32 + j, i * 32 + j <= 32), + ) + assert_iter_sum_pattern( + {(i * 32 + j - 1) % 16: (16, 0), (i * 32 + j - 1) // 16: (4, 0)}, + var_dom([(i, 5), (j, 32)]), + predicate=tvm.tir.all(1 <= i * 32 + j, i * 32 + j <= 64), ) - assert_iter_sum_pattern(res[0], 16, 0) - assert_iter_sum_pattern(res[1], 4, 0) # non-standard form of predicate - res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 < 128 - y[0]) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 128, 0) + assert_iter_sum_pattern( + {x * 10 + y: (128, 0)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 < 128 - y + ) # duplicate constraint - res = tvm.arith.detect_iter_map( - [x[0] * 10 + y[0]], - var_dom([x, y]), - tvm.tir.all(x[0] * 10 + y[0] < 128, x[0] * 10 + y[0] < 64), + assert_iter_sum_pattern( + {x * 10 + y: (64, 0)}, + var_dom([(x, 13), (y, 10)]), + predicate=tvm.tir.all(x * 10 + y < 128, x * 10 + y < 64), ) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 64, 0) - # useless constraint - res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 140) - - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 130, 0) + assert_iter_sum_pattern( + {x * 10 + y: (130, 0)}, var_dom([(x, 13), (y, 10)]), predicate=x * 10 + y < 140 + ) - i1 = tvm.tir.Var("i1", "int32"), 7 - i2 = tvm.tir.Var("i2", "int32"), 2 - i3 = tvm.tir.Var("i3", "int32"), 4 - i4 = tvm.tir.Var("i4", "int32"), 3 - res = tvm.arith.detect_iter_map( - [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], - var_dom([i1, i2, i3, i4]), - ( + i1 = tvm.tir.Var("i1", "int32") + i2 = tvm.tir.Var("i2", "int32") + i3 = tvm.tir.Var("i3", "int32") + i4 = tvm.tir.Var("i4", "int32") + assert_iter_sum_pattern( + {i1 * 20 + i2 * 10 + i3 * 3 + i4: (128, 0)}, + var_dom([(i1, 7), (i2, 2), (i3, 4), (i4, 3)]), + predicate=( tvm.tir.all( - i1[0] * 2 + i2[0] < 13, - i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, - i3[0] * 3 + i4[0] < 10, + i1 * 2 + i2 < 13, + i1 * 20 + i2 * 10 + i3 * 3 + i4 < 128, + i3 * 3 + i4 < 10, ) ), ) - assert len(res) == 1 - assert_iter_sum_pattern(res[0], 128, 0) - - i1 = tvm.tir.Var("i1", "int32"), 7 - i2 = tvm.tir.Var("i2", "int32"), 2 - i3 = tvm.tir.Var("i3", "int32"), 4 - i4 = tvm.tir.Var("i4", "int32"), 3 # wrong constraint - res = tvm.arith.detect_iter_map( - [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], - var_dom([i1, i2, i3, i4]), - ( + assert_iter_sum_failure( + [i1 * 20 + i2 * 10 + i3 * 3 + i4], + var_dom([(i1, 7), (i2, 2), (i3, 4), (i4, 3)]), + predicate=( tvm.tir.all( - i1[0] * 2 + i2[0] < 13, - i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, - i3[0] * 3 + i4[0] < 7, + i1 * 2 + i2 < 13, + i1 * 20 + i2 * 10 + i3 * 3 + i4 < 128, + i3 * 3 + i4 < 7, ) ), ) - assert len(res) == 0 # incompatible constraint - res = tvm.arith.detect_iter_map( - [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], - var_dom([i1, i2, i3, i4]), - ( + assert_iter_sum_failure( + [i1 * 20 + i2 * 10 + i3 * 3 + i4], + var_dom([(i1, 7), (i2, 2), (i3, 4), (i4, 3)]), + predicate=( tvm.tir.all( - i1[0] * 2 + i2[0] < 13, - i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, - i3[0] * 3 + i4[0] < 10, - i1[0] * 4 + i3[0] < 20, + i1 * 2 + i2 < 13, + i1 * 20 + i2 * 10 + i3 * 3 + i4 < 128, + i3 * 3 + i4 < 10, + i1 * 4 + i3 < 20, ) ), ) - 
assert len(res) == 0 - - res = tvm.arith.detect_iter_map( - [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], - var_dom([i1, i2, i3, i4]), - ( + assert_iter_sum_failure( + [i1 * 20 + i2 * 10 + i3 * 3 + i4], + var_dom([(i1, 7), (i2, 2), (i3, 4), (i4, 3)]), + predicate=( tvm.tir.all( - i1[0] * 2 + i2[0] < 13, - i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, - i1[0] * 4 + i3[0] < 20, + i1 * 2 + i2 < 13, + i1 * 20 + i2 * 10 + i3 * 3 + i4 < 128, + i1 * 4 + i3 < 20, ) ), ) - assert len(res) == 0 # zero iter - xo = tvm.tir.Var("xo", "int32"), 1 - xi = tvm.tir.Var("xi", "int32"), 129 - y = tvm.tir.Var("y", "int32"), 128 - - res = tvm.arith.detect_iter_map( - [xo[0] * 129 + xi[0], y[0]], var_dom([xo, xi, y]), xo[0] * 129 + xi[0] < 128 + xo = tvm.tir.Var("xo", "int32") + xi = tvm.tir.Var("xi", "int32") + y = tvm.tir.Var("y", "int32") + assert_iter_sum_pattern( + {xo * 129 + xi: (128, 0), y: (128, 0)}, + var_dom([(xo, 1), (xi, 129), (y, 128)]), + predicate=xo * 129 + xi < 128, ) @@ -554,9 +505,10 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[1][0], floormod(j0[0], 4)) tvm.ir.assert_structural_equal(res[1][1], i3[0]) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([i3])) + assert_iter_sum_pattern + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([i3])).indices assert len(res1) == 2 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0, j0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0, j0])).indices assert len(res2) == 2 # compound 1.2 @@ -568,9 +520,9 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[1][0], 0) tvm.ir.assert_structural_equal(res[1][1], (floormod(j0[0], 4) * 2) + i3[0]) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([j0, i3])) + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([j0, i3])).indices assert len(res1) == 2 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0])).indices assert len(res2) == 2 # compound 1.3 @@ -589,9 +541,9 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[2][0], (i0[0] * 2) + floordiv(j0[0], 4) < 7) tvm.ir.assert_structural_equal(res[2][1], True) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([i3])) + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([i3])).indices assert len(res1) == 2 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0, j0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0, j0])).indices assert len(res2) == 2 # compound 1.5 @@ -607,9 +559,9 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[2][0], True) tvm.ir.assert_structural_equal(res[2][1], (floormod(j0[0], 4) * 2) + i3[0] < 7) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([j0, i3])) + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([j0, i3])).indices assert len(res1) == 2 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([i0])).indices assert len(res2) == 2 # compound 1.6 @@ -644,9 +596,9 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[2][0], 0) tvm.ir.assert_structural_equal(res[2][1], (floormod(l1[0], 3) * 3) + j3[0]) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1], res[2][1]], var_dom([l1, j3])) + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1], res[2][1]], 
var_dom([l1, j3])).indices assert len(res1) == 3 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0, l0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0, l0])).indices assert len(res2) == 3 # compound 2.2 @@ -662,9 +614,11 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[2][0], 0) tvm.ir.assert_structural_equal(res[2][1], (floormod(l0[0] * 6 + l1[0], 3) * 3) + j3[0]) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1], res[2][1]], var_dom([l0, l1, j3])) + res1 = tvm.arith.detect_iter_map( + [res[0][1], res[1][1], res[2][1]], var_dom([l0, l1, j3]) + ).indices assert len(res1) == 3 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0])).indices assert len(res2) == 3 # compound 2.3 @@ -692,9 +646,9 @@ def test_subspace_division(): tvm.ir.assert_structural_equal(res[3][0], (j0[0] * 2) + l0[0] < 7) tvm.ir.assert_structural_equal(res[3][1], (floormod(l1[0], 3) * 3) + j3[0] < 8) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1], res[2][1]], var_dom([l1, j3])) + res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1], res[2][1]], var_dom([l1, j3])).indices assert len(res1) == 3 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0, l0])) + res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0], res[2][0]], var_dom([j0, l0])).indices assert len(res2) == 3 # compound 2.5 @@ -730,13 +684,6 @@ def test_complex(): i0 = ifuse([j0, j1], 200) i1 = ifuse([j2, j3], 50) - res = tvm.arith.detect_iter_map( - [i0[0], i1[0]], - var_dom([l0, l1, n0, n1, m1, l3]), - tvm.tir.all(i0[0] < 200, i1[0] < 50, m0[0] < 6, l2[0] < 16, j0[0] < 7, j3[0] < 15), - ) - assert len(res) == 2 - n0_mark = tvm.arith.IterMark(n0[0], n0[1]) n1_mark = tvm.arith.IterMark(n1[0], n1[1]) l0_mark = tvm.arith.IterMark(l0[0], l0[1]) @@ -784,16 +731,20 @@ def test_complex(): i0_final = tvm.arith.IterSumExpr([tvm.arith.IterSplitExpr(i0_mark, 1, i0[1], 1)], 0) i1_final = tvm.arith.IterSumExpr([tvm.arith.IterSplitExpr(i1_mark, 1, i1[1], 1)], 0) - tvm.ir.assert_structural_equal(i0_final, res[0]) - tvm.ir.assert_structural_equal(i1_final, res[1]) + assert_iter_sum_pattern( + {i0[0]: (200, 0, 1, i0_final), i1[0]: (50, 0, 1, i1_final)}, + var_dom([l0, l1, n0, n1, m1, l3]), + predicate=tvm.tir.all( + i0[0] < 200, i1[0] < 50, m0[0] < 6, l2[0] < 16, j0[0] < 7, j3[0] < 15 + ), + ) # wrong constraint - res = tvm.arith.detect_iter_map( + assert_iter_sum_failure( [i0[0], i1[0]], var_dom([l0, l1, n0, n1, m1, l3]), tvm.tir.all(i0[0] < 200, i1[0] < 50, m0[0] < 9, l2[0] < 16, j0[0] < 7, j3[0] < 14), ) - assert len(res) == 0 # subspace_division res = tvm.arith.subspace_divide( @@ -822,34 +773,33 @@ def test_complex(): ), ) - res1 = tvm.arith.detect_iter_map([res[0][1], res[1][1]], var_dom([n0, n1, m1, l3]), res[2][1]) - assert len(res1) == 2 - res2 = tvm.arith.detect_iter_map([res[0][0], res[1][0]], var_dom([l0, l1])) - assert len(res2) == 2 + assert_iter_sum_pattern( + {res[0][1]: (32, 0), res[1][1]: (15, 0)}, var_dom([n0, n1, m1, l3]), res[2][1] + ) + assert_iter_sum_pattern({res[0][0]: (8, 0), res[1][0]: (4, 0)}, var_dom([l0, l1])) def test_normalize_iter_map_to_expr(): fld = tvm.tir.floordiv flm = tvm.tir.floormod - x = tvm.tir.Var("x", "int32"), 10 - y = tvm.tir.Var("y", "int32"), 9 + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") - xo, xi = isplit(x, 5) - yo, yi = isplit(y, 3) + xo, xi = isplit((x, 10), 5) + 
yo, yi = isplit((y, 9), 3) z = ifuse([yo, xo, yi]) - - res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([x, y])) + res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([(x, 10), (y, 9)])) tvm.ir.assert_structural_equal( - tvm.arith.normalize_iter_map_to_expr(res[0]), - fld(y[0], 3) * 6 + fld(x[0], 5) * 3 + flm(y[0], 3), + tvm.arith.normalize_iter_map_to_expr(res.indices[0]), + fld(y, 3) * 6 + fld(x, 5) * 3 + flm(y, 3), ) - tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(res[1]), flm(x[0], 5)) + tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(res.indices[1]), flm(x, 5)) # iter mark wrap a complex expr - split = tvm.arith.IterSplitExpr(tvm.arith.IterMark(x[0] * y[0] + 1, 1024), 1, 1024, 1) - tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(split), x[0] * y[0] + 1) + split = tvm.arith.IterSplitExpr(tvm.arith.IterMark(x * y + 1, 1024), 1, 1024, 1) + tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(split), x * y + 1) def test_inverse_affine_iter_map(): @@ -863,7 +813,9 @@ def test_inverse_affine_iter_map(): l1_0, l1_1 = isplit(l1, 4) l0_1_l1_1_fused = ifuse([l0_1, l1_1]) - iter_map = tvm.arith.detect_iter_map([l0_1_l1_1_fused[0], l0_0[0], l1_0[0]], var_dom([l0, l1])) + iter_map = tvm.arith.detect_iter_map( + [l0_1_l1_1_fused[0], l0_0[0], l1_0[0]], var_dom([l0, l1]) + ).indices outputs = [tvm.tir.Var("output_{}".format(i), "int32") for i in range(len(iter_map))] res = tvm.arith.inverse_affine_iter_map(iter_map, outputs) assert len(res) == 2 @@ -882,7 +834,7 @@ def test_inverse_affine_iter_map(): iter_map = tvm.arith.detect_iter_map( [l0_1_l2_1_l1_1_l2_0_fused[0], l0_0[0], l2_2[0], l1_0[0]], var_dom([l0, l1, l2]) - ) + ).indices outputs = [tvm.tir.Var("output_{}".format(i), "int32") for i in range(len(iter_map))] res = tvm.arith.inverse_affine_iter_map(iter_map, outputs) assert len(res) == 3 @@ -902,7 +854,7 @@ def test_inverse_affine_iter_map(): l1_0, l1_1 = isplit(l1, 8) l2 = ifuse([l1_1, l1_0]) - iter_map = tvm.arith.detect_iter_map([l2[0]], var_dom([l0])) + iter_map = tvm.arith.detect_iter_map([l2[0]], var_dom([l0])).indices outputs = [tvm.tir.Var("output_{}".format(i), "int32") for i in range(len(iter_map))] res = tvm.arith.inverse_affine_iter_map(iter_map, outputs) assert len(res) == 1 @@ -918,12 +870,11 @@ def test_free_variables(): z = tvm.tir.Var("z", "int32") # illegal iter if z is within dom - res = tvm.arith.detect_iter_map([z * 19 + y * 3 + x], var_dom([(x, 3), (y, 3), (z, 3)])) - assert len(res) == 0 + assert_iter_sum_failure([z * 19 + y * 3 + x], var_dom([(x, 3), (y, 3), (z, 3)])) # iter is valid if z is free, even there are linear forms of z - res = tvm.arith.detect_iter_map( - [z * 19 + y * 3 + x], + assert_iter_sum_pattern( + {z * 19 + y * 3 + x: (9, z * 19)}, var_dom( [ (x, 3), @@ -931,9 +882,8 @@ def test_free_variables(): ] ), ) - assert_iter_sum_pattern(res[0], 9, z * 19) - res = tvm.arith.detect_iter_map( - [z * z + y * 3 + x], + assert_iter_sum_pattern( + {z * z + y * 3 + x: (9, z * z)}, var_dom( [ (x, 3), @@ -941,7 +891,105 @@ def test_free_variables(): ] ), ) - assert_iter_sum_pattern(res[0], 9, z * z) + + +def test_padding(): + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + fld = tvm.tir.floordiv + flm = tvm.tir.floormod + + # left padding only, offset divisible + sum = 64 + y + dom_map = var_dom([(y, 192)]) + assert_iter_sum_pattern( + {fld(sum, 32): (6, 2, 1), flm(sum, 32): (32, 0, 1)}, + dom_map, + check_level="bijective", + ) + + # left padding only, offset 
non-divisible + sum = 80 + y + dom_map = var_dom([(y, 176)]) + assert_iter_sum_pattern( + {fld(sum, 32): (6, 2, 1)}, + dom_map, + ) + assert_iter_sum_pattern( + {flm(fld(sum, 2), 16): (16, 0, 1), flm(sum, 2): (2, 0, 1)}, + dom_map, + ) + assert_iter_sum_failure({fld(sum, 32), flm(sum, 32)}, dom_map) + assert_iter_sum_failure({fld(sum, 32), fld(sum, 4)}, dom_map) + + # right padding only, offset divisible + sum = x * 32 + y * 8 + dom_map = var_dom([(x, 5), (y, 4)]) + assert_iter_sum_pattern( + {fld(sum, 16): (10, 0, 1), flm(sum, 16): (2, 0, 8)}, + dom_map, + ) + assert_iter_sum_failure({fld(sum, 5)}, dom_map) + + # right padding only, offset non-divisible + dom_map = var_dom([(x, 26)]) + assert_iter_sum_pattern( + {fld(x, 15): (2, 0, 1)}, + dom_map, + ) + assert_iter_sum_pattern( + {flm(fld(x, 3), 5): (5, 0, 1), flm(x, 3): (3, 0, 1)}, + dom_map, + ) + + # padding constants on both side + sum = x + 71 + dom_map = var_dom([(x, 45)]) + assert_iter_sum_pattern({fld(sum, 32): (2, 2, 1)}, dom_map) + assert_iter_sum_pattern( + {flm(fld(x, 4), 8): (8, 0, 1), flm(x, 4): (4, 0, 1)}, + dom_map, + ) + + # padding for free iteration part + sum = x * 360 + y + dom_map = var_dom([(y, 360)]) + assert_iter_sum_pattern({fld(sum, 16): (23, fld(x * 360 - flm(x, 2) * 8, 16), 1)}, dom_map) + assert_iter_sum_pattern({flm(x * 360 + y, 16): (16, 0, 1)}, dom_map) + + # multiple split with same mark offset, could + # be surjective on missing (padded // LCM) + assert_iter_sum_pattern( + { + flm(x + 10, 3): (3, 0), + flm(fld(x + 10, 3), 4): (4, 0), + flm(fld(fld(x + 10, 3), 4), 5): (5, 0), + }, + var_dom([(x, 240)]), + ) + assert_iter_sum_failure( + { + flm(x + 10, 3), + flm(fld(x + 10, 3), 4), + flm(fld(fld(x + 10, 3), 4), 5), + fld(fld(fld(x + 10, 3), 4), 5), + }, + var_dom([(x, 240)]), + ) + + # different offsets on splits + assert_iter_sum_pattern( + { + flm(x + 1, 3): (3, 0), + flm(fld(x + 10, 3) + 2, 4): (4, 0), + flm(fld(fld(x + 10, 3), 4) + 3, 5): (5, 0), + }, + var_dom([(x, 240)]), + ) + + # original extent is smaller than the divident + # it is not surjective wrt to the region [0, 16) + assert_iter_sum_failure({flm(x, 16)}, var_dom([(x, 3)])) if __name__ == "__main__": diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 8d26710f40dbf..82e1372f991e1 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -459,11 +459,13 @@ def test_div_index_simplify(): def test_floordiv_index_simplify(): # short name for floordiv fld = tvm.te.floordiv + flm = tvm.te.floormod ck = RewriteChecker() x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(fld(fld(x, 2), 3), fld(x, 6)) ck.verify(fld(fld(x, 2) + 1, 3), fld(x + 2, 6)) + ck.verify(fld(x - flm(x, 21), 21), fld(x, 21)) ck.verify(fld(x * 2, 4), fld(x, 2)) ck.verify(fld(x * 4, 2), x * 2) @@ -472,11 +474,17 @@ def test_floordiv_index_simplify(): ck.verify(fld(x * 8 - 1, 16), fld(x * 8 + -1, 16)) ck.verify(fld(x * 8 - 9, 16), fld(x, 2) + -1) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1), override=True) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 7), override=True) + ck.verify(fld(x * 360 + y, 16), x * 22) + ck.verify(fld(x * 360 + y, 25), x * 14) + ck.verify(fld(x * 360 - 8, 25), fld(x * 360 + -8, 25)) + ck.verify(fld(x * 4 + y, 2), x * 2 + fld(y, 2)) ck.verify(fld(tvm.te.min(x * 6, y), 2), tvm.te.min(x * 3, fld(y, 2))) ck.verify(fld(tvm.te.max(x * 6, y), 2), tvm.te.max(x * 3, fld(y, 2))) - ck.verify(fld(y + x * 
4, 2), fld(y, 2) + x * 2) + ck.verify(fld(y + x * 4, 2), x * 2 + fld(y, 2)) ck.verify(fld(tvm.te.min(y, x * 6), 2), tvm.te.min(fld(y, 2), x * 3)) ck.verify(fld(tvm.te.max(y, x * 6), 2), tvm.te.max(fld(y, 2), x * 3)) @@ -549,15 +557,17 @@ def test_mod_index_simplify(): def test_floormod_index_simplify(): # short name for floordiv flm = tvm.te.floormod - ck = RewriteChecker() x, y, z = te.var("x"), te.var("y"), te.var("z") ck = RewriteChecker() x, y, nx, ny, z = te.var("x"), te.var("y"), te.var("nx"), te.var("ny"), te.var("z") ck.verify(flm(x * 10, 2), 0) + ck.verify(flm(x * 9600, 6400), flm(x * 3200, 6400)) ck.verify(flm(x * 10 + y, 2), flm(y, 2)) + ck.verify(flm(x * 360 + y, 16), flm(x * 8 + y, 16)) ck.verify(flm(x + 10, 2), flm(x, 2)) ck.verify(flm(x + y * 10, 2), flm(x, 2)) + ck.verify(flm(x + y * 360, 16), flm(x + y * 8, 16)) ck.verify(flm(x * 10 + 1 + y * 2 + 2, 2), 1) ck.verify(flm(x * (-10), 2), 0) ck.verify(flm(x * (-10) + y, 2), flm(y, 2)) diff --git a/tests/python/unittest/test_tir_buffer.py b/tests/python/unittest/test_tir_buffer.py index 337f9cbc07223..10e827978cc0a 100644 --- a/tests/python/unittest/test_tir_buffer.py +++ b/tests/python/unittest/test_tir_buffer.py @@ -137,6 +137,7 @@ def assert_simplified_equal(index_simplified, index_direct): idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod + # Test Case1 index_simplified = A_stride.offset_of( (idxd(idxm(k0, k1), s), idxm(idxm(k0, k1), s) + idxd(k0, k1) * k1) @@ -174,7 +175,7 @@ def assert_simplified_equal(index_simplified, index_direct): j = te.size_var("j") k = te.size_var("k") - index_simplified = B.offset_of( + index_simplified1 = B.offset_of( ( idxd(idxd(idxd((i * 50176 + j * 28672 + k), 1024), 14), 14), idxm(idxd(idxd((i * 50176 + j * 28672 + k), 1024), 14), 14), @@ -182,8 +183,17 @@ def assert_simplified_equal(index_simplified, index_direct): idxm((i * 50176 + j * 28672 + k), 1024), ) ) + index_simplified2 = B.offset_of( + ( + idxd(idxd(i * 49 + j * 28 + idxd(k, 1024), 14), 14), + idxm(idxd(i * 49 + j * 28 + idxd(k, 1024), 14), 14), + idxm(i * 7 + idxd(k, 1024), 14), + idxm(k, 1024), + ) + ) index_direct = B.offset_of((0, 0, 0, (i * 50176 + j * 28672 + k))) - assert_simplified_equal(index_simplified, index_direct) + assert_simplified_equal(index_simplified1, index_direct) + assert_simplified_equal(index_simplified2, index_direct) @tvm.testing.requires_llvm diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py index b06dcebe1d1c5..f477367adfad3 100644 --- a/tests/python/unittest/test_tir_schedule_compute_at.py +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -1249,6 +1249,44 @@ def test_compute_at_simplify_static_bound(): verify_trace_roundtrip(sch=sch, mod=static_bound) +def test_compute_at_non_perfect_channel_group(): + @T.prim_func + def grouped_channel_bias( + X: T.Buffer[(720, 8, 8), "float32"], Y: T.Buffer[(720, 8, 8), "float32"] + ): + B = T.alloc_buffer([45], dtype="float32", scope="") + for i in T.grid(45): + with T.block("init"): + vi = T.axis.remap("S", [i]) + B[vi] = vi + for c_o, h, w, c_i in T.grid(2, 8, 8, 360): + with T.block("compute"): + hh, ww = T.axis.remap("SS", [h, w]) + cc = T.axis.spatial(720, c_o * 360 + c_i) + Y[cc, hh, ww] = X[cc, hh, ww] + B[cc // 16] + + @T.prim_func + def grouped_channel_bias_non_perfect_tiled( + X: T.Buffer[(720, 8, 8), "float32"], Y: T.Buffer[(720, 8, 8), "float32"] + ): + B = T.alloc_buffer([45], dtype="float32") + for c_o in range(2): + for ax0 in range(23): + with T.block("init"): + vi = 
T.axis.spatial(45, c_o * 22 + ax0) + B[vi] = vi + for h, w, c_i in T.grid(8, 8, 360): + with T.block("compute"): + hh, ww = T.axis.remap("SS", [h, w]) + cc = T.axis.spatial(720, c_o * 360 + c_i) + Y[cc, hh, ww] = X[cc, hh, ww] + B[cc // 16] + + sch = tir.Schedule(grouped_channel_bias, debug_mask="all") + loop = sch.get_loops(sch.get_block("compute"))[0] + sch.compute_at(sch.get_block("init"), loop) + tvm.ir.assert_structural_equal(sch.mod["main"], grouped_channel_bias_non_perfect_tiled) + + def test_fail_subtree_complete_block(): sch = tir.Schedule(fail_subtree_compact_dataflow, debug_mask="all") block = sch.get_block("B_0") From ac5d7813dff34566645787c9f3f2e6576dd723da Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 31 May 2022 20:03:04 +0100 Subject: [PATCH 003/181] [microNPU] Fix flaky compute cycle annotation test (#11510) Fixes non-deterministic test by disabling striping when running the cascader. Change-Id: Ib44f299f21fa0b41be4bfac3deb61a9c16818c58 --- tests/python/contrib/test_ethosu/cascader/test_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py index b3610315441ef..2dce6dfdd67ed 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py @@ -48,7 +48,6 @@ def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1Star cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config) -@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11483") def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE): device_config = cs.EthosuDeviceConfig("ethos-u55-256") options = infra.make_options( @@ -61,6 +60,7 @@ def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE): always_copy_size=1024, disable_pareto_plans=False, disable_pareto_proposals=False, + enable_striping=False, ) sch, te_graph, const_dict = TwoConv2DTE cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config) @@ -69,7 +69,7 @@ def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE): # [copy, copy, conv2d, copy, conv2d] stages = [6, 8, 9, 18, 19] # Expected hints for each operation - compute_cycles_hints = [4096, 5120, 1632, 2560, 3072] + compute_cycles_hints = [4096, 5120, 1440, 2560, 3072] for stage, compute_cycles_hint in zip(stages, compute_cycles_hints): op = sch.stages[stage] From 2252f958f75c6e33b946d23f1ebb803d41f0b63d Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 31 May 2022 13:27:01 -0700 Subject: [PATCH 004/181] [microTVM][ARM][Zephyr] Add CMSIS dependencies in Zephyr project build (#11362) * Test with CMSIS build added disabled conv2d_nhwc_dsp.arm_cpu for non integers workloads added debugging feature to TempDirectory * revert arm_cpu strategy changes * Address Andrew comments * change copy to include * add cmsis_path only as project option --- .../template_project/microtvm_api_server.py | 45 ++++++++++-- python/tvm/contrib/utils.py | 14 ++-- tests/micro/zephyr/conftest.py | 21 +++++- tests/micro/zephyr/test_zephyr.py | 70 +++++++++++++++++++ tests/micro/zephyr/test_zephyr_aot.py | 2 + tests/micro/zephyr/test_zephyr_armv7m.py | 1 + tests/scripts/task_python_microtvm.sh | 1 + 7 files changed, 144 insertions(+), 10 deletions(-) diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index 
059e7604896c0..bcf9f78f4b112 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -27,7 +27,6 @@ import pathlib import queue import re -import select import shlex import shutil import subprocess @@ -35,7 +34,7 @@ import tarfile import tempfile import threading -import time +from typing import Union import usb import serial @@ -323,6 +322,12 @@ def _get_nrf_device_args(options): type="str", help="Extra definitions added project compile.", ), + server.ProjectOption( + "cmsis_path", + optional=["generate_project"], + type="str", + help="Path to the CMSIS directory.", + ), ] @@ -333,6 +338,13 @@ def get_zephyr_base(options: dict): return zephyr_base +def get_cmsis_path(options: dict) -> pathlib.Path: + """Returns CMSIS dependency path""" + cmsis_path = options.get("cmsis_path") + assert cmsis_path, "'cmsis_path' option not passed!" + return pathlib.Path(cmsis_path) + + class Handler(server.ProjectAPIHandler): def __init__(self): super(Handler, self).__init__() @@ -424,6 +436,17 @@ def _get_platform_version(self, zephyr_base: str) -> float: return float(f"{version_major}.{version_minor}") + def _cmsis_required(self, project_path: Union[str, pathlib.Path]) -> bool: + """Check if CMSIS dependency is required.""" + project_path = pathlib.Path(project_path) + for path in (project_path / "codegen" / "host" / "src").iterdir(): + if path.is_file(): + with open(path, "r") as lib_f: + lib_content = lib_f.read() + if "" in lib_content and "" in lib_content: + return True + return False + def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): # Check Zephyr version version = self._get_platform_version(get_zephyr_base(options)) @@ -470,8 +493,8 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec shutil.copy2(src_path, dst_path) # Populate Makefile. - with open(API_SERVER_DIR / "CMakeLists.txt.template", "r") as cmake_template_f: - with open(project_dir / "CMakeLists.txt", "w") as cmake_f: + with open(project_dir / "CMakeLists.txt", "w") as cmake_f: + with open(API_SERVER_DIR / "CMakeLists.txt.template", "r") as cmake_template_f: for line in cmake_template_f: if self.API_SERVER_CRT_LIBS_TOKEN in line: crt_libs = self.CRT_LIBS_BY_PROJECT_TYPE[options["project_type"]] @@ -484,6 +507,20 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec for item in flags: cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n") + # Include CMSIS libraries if required. 
+ if self._cmsis_required(extract_path): + cmsis_path = get_cmsis_path(options) + cmake_f.write("\n") + cmake_f.write( + f'target_include_directories(tvm_model PRIVATE {str(cmsis_path / "CMSIS" / "DSP" / "Include")})\n' + ) + cmake_f.write( + f'target_include_directories(tvm_model PRIVATE {str(cmsis_path / "CMSIS" / "DSP" / "Include" / "dsp")})\n' + ) + cmake_f.write( + f'target_include_directories(tvm_model PRIVATE {str(cmsis_path / "CMSIS" / "NN" / "Include")})\n' + ) + self._create_prj_conf(project_dir, options) # Populate crt-config.h diff --git a/python/tvm/contrib/utils.py b/python/tvm/contrib/utils.py index e2ca182779c6f..89688b5bf86f4 100644 --- a/python/tvm/contrib/utils.py +++ b/python/tvm/contrib/utils.py @@ -93,11 +93,15 @@ def set_keep_for_debug(cls, set_to=True): finally: cls._KEEP_FOR_DEBUG = old_keep_for_debug - def __init__(self, custom_path=None): + def __init__(self, custom_path=None, keep_for_debug=None): if self.TEMPDIRS is None: raise DirectoryCreatedPastAtExit() - self._created_with_keep_for_debug = self._KEEP_FOR_DEBUG + if keep_for_debug is not None: + self._created_with_keep_for_debug = keep_for_debug + else: + self._created_with_keep_for_debug = self._KEEP_FOR_DEBUG + if custom_path: os.mkdir(custom_path) self.temp_dir = custom_path @@ -169,7 +173,7 @@ def listdir(self): atexit.register(TempDirectory.remove_tempdirs) -def tempdir(custom_path=None): +def tempdir(custom_path=None, keep_for_debug=None): """Create temp dir which deletes the contents when exit. Parameters @@ -177,12 +181,14 @@ def tempdir(custom_path=None): custom_path : str, optional Manually specify the exact temp dir path + keep_for_debug : bool + Keep temp directory for debugging purposes Returns ------- temp : TempDirectory The temp directory object """ - return TempDirectory(custom_path) + return TempDirectory(custom_path=custom_path, keep_for_debug=keep_for_debug) class FileLock(object): diff --git a/tests/micro/zephyr/conftest.py b/tests/micro/zephyr/conftest.py index 177ca8aa269e8..997237d370a5d 100644 --- a/tests/micro/zephyr/conftest.py +++ b/tests/micro/zephyr/conftest.py @@ -59,7 +59,7 @@ def tvm_debug(request): @pytest.fixture -def temp_dir(board): +def temp_dir(board, tvm_debug): parent_dir = pathlib.Path(os.path.dirname(__file__)) filename = os.path.splitext(os.path.basename(__file__))[0] board_workspace = ( @@ -76,4 +76,21 @@ def temp_dir(board): if not os.path.exists(board_workspace.parent): os.makedirs(board_workspace.parent) - return tempdir(board_workspace) + keep_for_debug = tvm_debug if tvm_debug else None + test_temp_dir = tempdir(custom_path=board_workspace, keep_for_debug=keep_for_debug) + return test_temp_dir + + +@pytest.fixture(autouse=True) +def skip_by_board(request, board): + """Skip test if board is in the list.""" + if request.node.get_closest_marker("skip_boards"): + if board in request.node.get_closest_marker("skip_boards").args[0]: + pytest.skip("skipped on this board: {}".format(board)) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "skip_by_board(board): skip test for the given board", + ) diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index f89d11cf44dcc..2651435434b11 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -22,6 +22,7 @@ import pytest import numpy as np + import onnx from PIL import Image @@ -32,6 +33,7 @@ from tvm.relay.testing import byoc from tvm.contrib import utils from tvm.micro.testing.utils import check_tune_log +from tvm.target import 
arm_isa import test_utils @@ -87,6 +89,7 @@ def _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config, dtype= # The same test code can be executed on both the QEMU simulation and on real hardware. @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_add_uint(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" @@ -112,6 +115,7 @@ def test_basic_add(sess): # The same test code can be executed on both the QEMU simulation and on real hardware. @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_add_float(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" model = test_utils.ZEPHYR_BOARDS[board] @@ -138,6 +142,7 @@ def test_basic_add(sess): @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_platform_timer(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" @@ -167,6 +172,7 @@ def test_basic_add(sess): @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_relay(temp_dir, board, west_cmd, tvm_debug): """Testing a simple relay graph""" model = test_utils.ZEPHYR_BOARDS[board] @@ -199,6 +205,7 @@ def test_relay(temp_dir, board, west_cmd, tvm_debug): @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_onnx(temp_dir, board, west_cmd, tvm_debug): """Testing a simple ONNX model.""" model = test_utils.ZEPHYR_BOARDS[board] @@ -279,6 +286,7 @@ def check_result( @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_byoc_microtvm(temp_dir, board, west_cmd, tvm_debug): """This is a simple test case to check BYOC capabilities of microTVM""" model = test_utils.ZEPHYR_BOARDS[board] @@ -359,6 +367,7 @@ def _make_add_sess_with_shape(temp_dir, model, zephyr_board, west_cmd, shape, bu ], ) @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_rpc_large_array(temp_dir, board, west_cmd, tvm_debug, shape): """Test large RPC array transfer.""" model = test_utils.ZEPHYR_BOARDS[board] @@ -504,5 +513,66 @@ def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5) +@tvm.testing.requires_micro +def test_schedule_build_with_cmsis_dependency(temp_dir, board, west_cmd, tvm_debug): + """Test Relay schedule with CMSIS dependency. This test shows if microTVM Auto tuning + with Zephyr breaks if CMSIS dependency was required for a schedule. + """ + model = test_utils.ZEPHYR_BOARDS[board] + build_config = {"debug": tvm_debug} + target = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"]) + + isa = arm_isa.IsaAnalyzer(target) + if not isa.has_dsp_support: + pytest.skip(f"ISA does not support DSP. 
target: {target}") + + # Create a Relay conv2d + data_shape = (1, 16, 16, 3) + weight_shape = (5, 5, 8, 3) + data = relay.var("data", relay.TensorType(data_shape, "int8")) + weight = relay.var("weight", relay.TensorType(weight_shape, "int8")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + data_layout="NHWC", + kernel_layout="HWOI", + out_dtype="int32", + ) + func = relay.Function([data, weight], y) + ir_mod = tvm.IRModule.from_expr(func) + + runtime = Runtime("crt", {"system-lib": True}) + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + mod = tvm.relay.build(ir_mod, target=target, runtime=runtime) + + project_options = { + "project_type": "host_driven", + "west_cmd": west_cmd, + "verbose": bool(build_config.get("debug")), + "zephyr_board": board, + "cmsis_path": os.getenv("CMSIS_PATH"), + } + + project_dir = temp_dir / "project" + project = tvm.micro.generate_project( + str(test_utils.TEMPLATE_PROJECT_DIR), + mod, + project_dir, + project_options, + ) + project.build() + + with open(project_dir / "CMakeLists.txt", "r") as cmake_f: + cmake_content = cmake_f.read() + + assert "CMSIS/DSP/Include" in cmake_content + assert "CMSIS/DSP/Include/dsp" in cmake_content + assert "CMSIS/DSP/Include" in cmake_content + assert "CMSIS/NN/Include" in cmake_content + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index cfe2ce2ae3c8f..3d509f100d6ec 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -38,6 +38,7 @@ @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_tflite(temp_dir, board, west_cmd, tvm_debug): """Testing a TFLite model.""" model = test_utils.ZEPHYR_BOARDS[board] @@ -93,6 +94,7 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_qemu_make_fail(temp_dir, board, west_cmd, tvm_debug): """Testing QEMU make fail.""" if board not in ["qemu_x86", "mps2_an521", "mps3_an547"]: diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py index 2631e43799668..c629403ced821 100644 --- a/tests/micro/zephyr/test_zephyr_armv7m.py +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -103,6 +103,7 @@ def _apply_desired_layout_no_simd(relay_mod): @tvm.testing.requires_micro +@pytest.mark.skip_boards(["mps2_an521"]) def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug): """Testing a ARM v7m SIMD extension.""" diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index 557e938a6ed3a..2274c6ca6b283 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -27,6 +27,7 @@ make cython3 run_pytest ctypes python-microtvm-zephyr-qemu_x86 tests/micro/zephyr --zephyr-board=qemu_x86 run_pytest ctypes python-microtvm-zephyr-qemu_riscv32 tests/micro/zephyr --zephyr-board=qemu_riscv32 run_pytest ctypes python-microtvm-zephyr-qemu_riscv64 tests/micro/zephyr --zephyr-board=qemu_riscv64 +run_pytest ctypes python-microtvm-zephyr-mps2_an521 tests/micro/zephyr --zephyr-board=mps2_an521 # Arduino run_pytest ctypes python-microtvm-arduino apps/microtvm/arduino/template_project/tests From a71536a130685a50582eea8c993030872cddb145 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 31 May 2022 15:57:30 -0700 Subject: [PATCH 005/181] [MetaSchedule] Enable Task Filtering (#11512) This PR allows 
`relay.backend.MetaScheduleExtractTask` to take an extra argument `filter_func` which filters out tasks that don't need tuning. The counterpart of AutoScheduler is `traverse_to_get_io_tensors`. --- python/tvm/meta_schedule/relay_integration.py | 8 +- python/tvm/te/__init__.py | 2 +- python/tvm/te/operation.py | 29 ++----- src/relay/backend/task_extraction.cc | 80 +++++++++++++------ src/te/operation/create_primfunc.cc | 33 -------- src/te/operation/create_primfunc.h | 3 - src/tir/schedule/concrete_schedule.cc | 20 ++--- .../test_meta_schedule_integration.py | 63 +++++++++++++++ 8 files changed, 140 insertions(+), 98 deletions(-) diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index 47f76830ab88f..b556338174130 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """MetaSchedule-Relay integration""" -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional import numpy as np # type: ignore from tvm import nd @@ -23,6 +23,7 @@ from tvm.ir import IRModule, transform from tvm.runtime import NDArray from tvm.target import Target +from tvm.te import Tensor from .extracted_task import ExtractedTask from .utils import autotvm_silencer @@ -36,6 +37,7 @@ def extract_task_from_relay( opt_level: int = 3, pass_config: Optional[Dict[str, Any]] = None, disabled_pass: Optional[List[str]] = None, + filter_func: Callable[[List[Tensor]], bool] = None, ) -> List[ExtractedTask]: """Extract tuning tasks from a relay program. @@ -53,6 +55,8 @@ def extract_task_from_relay( The pass config of the compiler disabled_pass : Optional[List[str]] The list of disabled passes of the compiler + filter_func : Callable[[List[tvm.te.Tensor]], bool] + The filter function to filter out the extracted tasks Returns ------- @@ -90,4 +94,4 @@ def extract_task_from_relay( config=pass_config, disabled_pass=disabled_pass, ): - return list(extract_task_func(mod, target, relay_params)) + return list(extract_task_func(mod, target, relay_params, filter_func)) diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 4c4e223f2d723..1777d8707c7ce 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -39,7 +39,7 @@ from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var, const from .operation import thread_axis, reduce_axis -from .operation import create_prim_func, create_prim_func_from_outputs +from .operation import create_prim_func from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp, HybridOp from .autodiff import gradient diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 90d7cb5d75dbc..df5dd2c4ffd81 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -15,17 +15,18 @@ # specific language governing permissions and limitations # under the License. """ Operation class for computation declaration.""" +import inspect + # pylint: disable=invalid-name from numbers import Integral as _Integral -from typing import List, Union -import inspect +from typing import List import tvm._ffi +import tvm.tir +import tvm.tir._ffi_api from tvm._ffi.base import string_types from tvm.ir import Array from tvm.runtime import convert -import tvm.tir -import tvm.tir._ffi_api from . import _ffi_api from . 
import tag as _tag @@ -528,23 +529,3 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: if not isinstance(ops, (list, tuple, Array)): ops = [ops] return _ffi_api.CreatePrimFunc(ops) - - -def create_prim_func_from_outputs( - outputs: Union[_tensor.Tensor, List[_tensor.Tensor]], -) -> tvm.tir.PrimFunc: - """Create a TensorIR PrimFunc from output tensor(s) in TE - - Parameters - ---------- - outputs : Union[Tensor, List[Tensor]] - The source expression. - - Returns - ------- - func : tir.PrimFunc - The created function. - """ - if not isinstance(outputs, (list, tuple, Array)): - outputs = [outputs] - return _ffi_api.CreatePrimFuncFromOutputs(outputs) diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc index 0895fd42a3077..6ec881111d770 100644 --- a/src/relay/backend/task_extraction.cc +++ b/src/relay/backend/task_extraction.cc @@ -31,25 +31,58 @@ namespace tvm { namespace relay { namespace backend { -namespace metaschedule { - -using meta_schedule::ExtractedTask; +bool DefaultTaskFilter(const Array& args) { + using namespace ::tvm::te; + std::vector stack; + std::unordered_set visited; + for (const Tensor& v : args) { + for (const PrimExpr& e : v->shape) { + // Dynamic shape is not supported for now + if (!e->IsInstance()) { + return false; + } + } + if (!visited.count(v.get())) { + visited.insert(v.get()); + stack.push_back(v); + } + } + while (!stack.empty()) { + Tensor tensor = stack.back(); + stack.pop_back(); + if (tensor->op->IsInstance()) { + // do nothing + } else if (tensor->op->IsInstance()) { + Array inputs = tensor->op->InputTensors(); + for (const Tensor& v : inputs) { + if (!visited.count(v.get())) { + visited.insert(v.get()); + stack.push_back(v); + } + } + } else { + return false; + } + } + return true; +} -Array ExtractTask(IRModule mod, Target target, - Map params) { +Array ExtractTask( + IRModule mod, Target target, Map params, + runtime::TypedPackedFunc&)> filter_func) { + using meta_schedule::ExtractedTask; + if (filter_func == nullptr) { + filter_func = DefaultTaskFilter; + } backend::BindParamsInModule(mod, params); - // is_vm=true for backward compatibility Array pass_seqs = relay::backend::GetPassPrefix(/*is_homogenous=*/true, /*is_vm=*/true); pass_seqs.push_back(transform::FuseOps()); - - transform::Sequential seq(pass_seqs); - auto opt_mod = seq(std::move(mod)); + mod = transform::Sequential(pass_seqs)(std::move(mod)); std::vector tasks; std::unordered_map cache; - - PostOrderVisit(opt_mod->Lookup("main"), [target, &tasks, &cache](const Expr& exp) { + PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &filter_func](const Expr& exp) { if (exp->IsInstance()) { Function relay_func = Downcast(exp); if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) { @@ -61,17 +94,19 @@ Array ExtractTask(IRModule mod, Target target, it->second->weight += 1; return; } - Array inputs_outputs; + Array inputs_outputs{nullptr}; std::string fused_name; std::tie(inputs_outputs, fused_name) = tec::LowerTECompute(relay_func, target, /*return_inputs=*/true); - auto prim_func = tir::CreatePrimFunc(inputs_outputs); - GlobalVar prim_fn_var(fused_name); - IRModule relay_mod({{prim_fn_var, relay_func}}); - IRModule tir_mod({{prim_fn_var, prim_func}}); - ExtractedTask extracted_task(fused_name, relay_mod, target, {tir_mod}, 1); - tasks.push_back(extracted_task); - cache.emplace(cache_key, extracted_task); + if (filter_func(inputs_outputs)) { + tir::PrimFunc prim_func = tir::CreatePrimFunc(inputs_outputs); + GlobalVar 
prim_fn_var(fused_name); + IRModule relay_mod({{prim_fn_var, relay_func}}); + IRModule tir_mod({{prim_fn_var, prim_func}}); + ExtractedTask extracted_task(fused_name, relay_mod, target, {tir_mod}, 1); + tasks.push_back(extracted_task); + cache.emplace(cache_key, extracted_task); + } } }); // Tasks are extracted via post order visit, return the reversed list. @@ -83,12 +118,7 @@ Array ExtractTask(IRModule mod, Target target, return tasks; } -} // namespace metaschedule - -TVM_REGISTER_GLOBAL("relay.backend.MetaScheduleExtractTask") - .set_body_typed([](IRModule mod, Target target, Map params) { - return metaschedule::ExtractTask(mod, target, params); - }); +TVM_REGISTER_GLOBAL("relay.backend.MetaScheduleExtractTask").set_body_typed(ExtractTask); } // namespace backend } // namespace relay diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 7e7dae855802f..03ad551c68391 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -458,40 +458,7 @@ PrimFunc CreatePrimFunc(const Array& arg_list) { return LayoutFreePlaceholdersNormalizer().Process(std::move(func)); } -PrimFunc CreatePrimFuncFromOutputs(const Array& outputs) { - std::vector stack; - std::unordered_set visited; - for (const te::Tensor& output : outputs) { - if (!visited.count(output.get())) { - visited.insert(output.get()); - stack.push_back(output); - } - } - - Array arg_list; - while (!stack.empty()) { - te::Tensor tensor = stack.back(); - stack.pop_back(); - if (tensor->op->IsInstance()) { - arg_list.push_back(tensor); - } else if (tensor->op->IsInstance()) { - Array inputs = tensor->op->InputTensors(); - for (const te::Tensor& input : inputs) { - if (!visited.count(input.get())) { - visited.insert(input.get()); - stack.push_back(input); - } - } - } - } - for (const te::Tensor& output : outputs) { - arg_list.push_back(output); - } - return CreatePrimFunc(arg_list); -} - TVM_REGISTER_GLOBAL("te.CreatePrimFunc").set_body_typed(CreatePrimFunc); -TVM_REGISTER_GLOBAL("te.CreatePrimFuncFromOutputs").set_body_typed(CreatePrimFuncFromOutputs); } // namespace tir } // namespace tvm diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h index d911e5ebcdb7d..c3cddd83f57a8 100644 --- a/src/te/operation/create_primfunc.h +++ b/src/te/operation/create_primfunc.h @@ -30,9 +30,6 @@ namespace tir { /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */ PrimFunc CreatePrimFunc(const Array& arg_list); -/*! \brief Create a schedulable TensorIR func from TE compute outputs. 
*/ -PrimFunc CreatePrimFuncFromOutputs(const Array& outputs); - } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 8066d85a8e7db..2289899c329bb 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -199,16 +199,16 @@ Schedule ConcreteScheduleNode::Copy() { * \param level An ScheduleErrorRenderLevel enum, level of error rendering * \sa ScheduleErrorRenderLevel */ -#define TVM_TIR_SCHEDULE_END(primitive, level) \ - } \ - catch (const ScheduleError& error) { \ - if ((level) == ScheduleErrorRenderLevel::kDetail) { \ - throw tvm::runtime::Error(error.RenderReport(primitive)); \ - } else if ((level) == ScheduleErrorRenderLevel::kFast) { \ - throw tvm::runtime::Error(error.FastErrorString()); \ - } else if ((level) == ScheduleErrorRenderLevel::kNone) { \ - throw tvm::runtime::Error("ScheduleError: (not rendered)"); \ - } \ +#define TVM_TIR_SCHEDULE_END(primitive, level) \ + } \ + catch (const ScheduleError& error) { \ + if ((level) == ScheduleErrorRenderLevel::kDetail) { \ + throw tvm::runtime::Error(error.RenderReport(primitive) + "\n" + runtime::Backtrace()); \ + } else if ((level) == ScheduleErrorRenderLevel::kFast) { \ + throw tvm::runtime::Error(error.FastErrorString()); \ + } else if ((level) == ScheduleErrorRenderLevel::kNone) { \ + throw tvm::runtime::Error("ScheduleError: (not rendered)"); \ + } \ } /******** Schedule: Schedule: Sampling ********/ diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py index cd6e1b4c405ac..a423bdb48afdf 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -196,6 +196,69 @@ def test_meta_schedule_integration_extract_from_bert_base(): assert expected_shape == shape, t.task_name +@requires_torch +def test_meta_schedule_integration_extract_from_resnet_with_filter_func(): + def filter_func(args) -> bool: + from tvm import te, tir + + has_complex_op = False + visited = set() + + def traverse(t): + nonlocal has_complex_op + assert t.handle is not None + if t.handle.value in visited: + return + if isinstance(t.op, te.PlaceholderOp): + pass + elif isinstance(t.op, te.ComputeOp): + has_complex_op = has_complex_op or any( + [isinstance(e, tir.Reduce) for e in t.op.body] + ) + for x in t.op.input_tensors: + traverse(x) + visited.add(t.handle.value) + + for t in args: + traverse(t) + return has_complex_op + + mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) + extracted_tasks = ms.extract_task_from_relay( + mod, + target="llvm", + params=params, + filter_func=filter_func, + ) + expected_task_names = [ + "fused_" + s + for s in [ + "nn_max_pool2d", + "nn_adaptive_avg_pool2d", + "nn_dense_add", + "nn_conv2d_add", + "nn_conv2d_add_1", + "nn_conv2d_add_2", + "nn_conv2d_add_add_nn_relu", + "nn_conv2d_add_add_nn_relu_1", + "nn_conv2d_add_nn_relu", + "nn_conv2d_add_nn_relu_1", + "nn_conv2d_add_nn_relu_2", + "nn_conv2d_add_nn_relu_3", + "nn_conv2d_add_nn_relu_4", + "nn_conv2d_add_nn_relu_5", + "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu", + "nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1", + "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu", + "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1", + ] + ] + + assert len(extracted_tasks) == len(expected_task_names) + for t in extracted_tasks: + 
assert t.task_name in expected_task_names, t.task_name + + @requires_torch def test_meta_schedule_integration_apply_history_best(): mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) From 0cd4dd2f2d6cab265844de0cb8745e0de8d22571 Mon Sep 17 00:00:00 2001 From: wangxiang2713 <49302617+wangxiang2713@users.noreply.github.com> Date: Wed, 1 Jun 2022 20:58:14 +0800 Subject: [PATCH 006/181] [BugFix] Add lock for ModuleNode::GetFuncFromEnv (#11467) * [BugFix] Add lock for ModuleNode::GetFuncFromEnv * [BugFix] Add lock for ModuleNode::GetFuncFromEnv --- include/tvm/runtime/module.h | 2 ++ src/runtime/module.cc | 1 + 2 files changed, 3 insertions(+) diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 875d999c64fab..31d05571eefd2 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -234,6 +235,7 @@ class TVM_DLL ModuleNode : public Object { private: /*! \brief Cache used by GetImport */ std::unordered_map > import_cache_; + std::mutex mutex_; }; /*! diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 57fe57568994b..633dc7c176711 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -107,6 +107,7 @@ std::string ModuleNode::GetSource(const std::string& format) { } const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) { + std::lock_guard lock(mutex_); auto it = import_cache_.find(name); if (it != import_cache_.end()) return it->second.get(); PackedFunc pf; From ee26ecf1d516af3c7693f6cb53901b4a055ef9d4 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Wed, 1 Jun 2022 15:51:56 +0100 Subject: [PATCH 007/181] [microNPU] Add transform matrices and part matcher to identity op (#11453) * [microNPU] Add transform matrices and part matcher to identity op * Address comments * Enable cascader in identity tests * Address comments --- .../contrib/ethosu/cascader/device_config.py | 46 ++++++---- .../backend/contrib/ethosu/te/identity.py | 87 +++++++++++++++++- .../cascader/test_ethosu_identity_matcher.py | 58 ++++++++++++ .../contrib/test_ethosu/test_codegen.py | 89 +++++++++++-------- 4 files changed, 223 insertions(+), 57 deletions(-) create mode 100644 tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 27aa8b8c78c59..f654a2598ba41 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -48,9 +48,24 @@ def __init__(self, shape: List[int], layout="NHWC"): self.width = int(shape[3]) self.depth = int(shape[2]) * int(shape[4]) else: - self.height = int(shape[1]) - self.width = int(shape[2]) - self.depth = int(shape[3]) + # identity layout is NHWC but the shape is not always 4 + length = len(shape) + if length == 4: + self.height = int(shape[1]) + self.width = int(shape[2]) + self.depth = int(shape[3]) + elif length == 3: + self.height = int(shape[0]) + self.width = int(shape[1]) + self.depth = int(shape[2]) + elif length == 2: + self.height = int(shape[0]) + self.width = int(shape[1]) + self.depth = 1 + elif length == 1: + self.height = int(shape[0]) + self.width = 1 + self.depth = 1 def round_up(self, other: "_Shape"): self.height = _round_up(self.height, other.height) @@ -627,18 +642,19 @@ def _get_subkernel_propagator( stride_w = int(op_attrs.get("stride_w", 1)) transform = ifm_propagator.transform - if 
input_layout == "NHCWB16": - transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) - transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w) - else: - transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) - transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w) - - if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"): - if output_layout == "NHCWB16" and input_layout == "NHWC": - transform[3][-1] = depth - elif output_layout == "NHCWB16" and input_layout == "NHCWB16": - transform[2][-1] = 1 + ((depth - 1) // 16) + if op_type != "ethosu_identity": + if input_layout == "NHCWB16": + transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) + transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w) + else: + transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) + transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w) + + if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"): + if output_layout == "NHCWB16" and input_layout == "NHWC": + transform[3][-1] = depth + elif output_layout == "NHCWB16" and input_layout == "NHCWB16": + transform[2][-1] = 1 + ((depth - 1) // 16) return Propagator(transform, ifm_propagator.offset) diff --git a/python/tvm/relay/backend/contrib/ethosu/te/identity.py b/python/tvm/relay/backend/contrib/ethosu/te/identity.py index 271ca1542fc5c..0b61e0c28b880 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/identity.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/identity.py @@ -16,7 +16,10 @@ # under the License. # pylint: disable=invalid-name,unused-argument """Tensor Expression for identity""" +import numpy as np from tvm import te +from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher + from .dma import read_compute, write_compute @@ -56,7 +59,6 @@ def identity_compute( ------- te.Tensor The Output Feature Map tensor. - """ dmaed_ifm = read_compute(ifm, ifm_zero_point, ifm_scale) id_attrs = {"op": "ethosu_identity", "activation": activation} @@ -76,7 +78,86 @@ def identity_compute( name="ethosu_identity", attrs=id_attrs, ) + length = len(ifm.shape) + ifm_matrix = np.identity(length + 1) + offset = np.zeros(length, dtype="int64") + ifm_propagator = Propagator( + ifm_matrix, + offset.tolist(), + ) + propagator_attrs = { + "ifm_propagator": ifm_propagator, + } + return write_compute(identity, ofm_zero_point, ofm_scale, attrs=propagator_attrs) + + +@register_matcher +def match_ethosu_identity(output_tensor, device_config): + """Match a Tensor Expression corresponding to an NPU identity. - dmaed_ofm = write_compute(identity, ofm_zero_point, ofm_scale) + If the Tensor Expression matches, an EthosuPart will be created that models the + matched Tensor Expression. Otherwise, None will be returned. - return dmaed_ofm + Parameters + ---------- + output_tensor : tvm.te.Tensor + The tensor to attempt to match with. + device_config : EthosuDeviceConfig + Target device configuration + + Returns + ------- + Union[None, EthosuPart] + The created EthosuPart if there was a match, otherwise None. 
+ """ + write = output_tensor + if write.op.name != "ethosu_write": + return None + identity = write.op.input_tensors[0] + if identity.op.name != "ethosu_identity": + return None + read = identity.op.input_tensors[0] + if read.op.name != "ethosu_read": + return None + + input_tensors = [ + read.op.input_tensors[0], + ] + subgraph = TESubgraph(input_tensors, output_tensor) + propagators = [ + write.op.attrs["ifm_propagator"], + ] + ifm_dtype = input_tensors[0].dtype + ofm_dtype = output_tensor.dtype + + input_tensors_shape = input_tensors[0].shape + length = len(input_tensors_shape) + assert length <= 4 + channels = int(input_tensors_shape[length - 1]) if length >= 3 else 1 + + subkernels = len(device_config.get_kernel_steps(identity.op.name, 1, 1, ifm_dtype)) + + input_layout = output_layout = "NHWC" + output_quantum = device_config.get_output_quantum(output_layout) + + valid_block_configs = device_config.get_valid_block_configs( + propagators[0], + identity.op.attrs, + output_tensor.shape, + channels, + channels, + output_layout, + input_layout, + ifm_dtype, + ofm_dtype, + 1, + 1, + ) + + return EthosuPart( + subgraph, + propagators, + output_quantum, + subkernels, + valid_block_configs, + ) diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py new file mode 100644 index 0000000000000..4609a5bc3779a --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest + +pytest.importorskip("ethosu.vela") + +import numpy as np + +from tvm import te +import tvm.contrib.ethosu.cascader as cs +from tvm.relay.backend.contrib.ethosu.te.identity import match_ethosu_identity, identity_compute +from .infra import make_matrices + + +def test_ethosu_identity_matcher(): + ofm_channels = 21 + ifm_shape = (1, 12, 15, ofm_channels) + ifm = te.placeholder(ifm_shape, dtype="int8") + lut = te.placeholder((), dtype="uint8") + out = identity_compute( + ifm=ifm, + lut=lut, + ifm_scale=1, + ifm_zero_point=0, + ofm_scale=1, + ofm_zero_point=0, + activation="NONE", + ) + + length = len(ifm.shape) + ifm_transform = np.identity(length + 1).tolist() + ifm_offset = np.zeros(length, dtype="int64").tolist() + + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + part = match_ethosu_identity(out, device_config) + + assert isinstance(part, cs.EthosuPart) + assert len(part.propagators) == 1 + assert part.propagators[0].transform == ifm_transform + assert part.propagators[0].offset == ifm_offset + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index ce617d14fac2b..b6b78c3357605 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -37,6 +37,10 @@ ACCEL_TYPES = ["ethos-u55-256", "ethos-u55-128", "ethos-u55-64", "ethos-u55-32", "ethos-u65-256"] +def is_u55_accel_type(accel_type): + return "u55" in accel_type + + @pytest.mark.parametrize("accel_type", ACCEL_TYPES + ["ethos-u65-512"]) @pytest.mark.parametrize("ifm_shape", [(1, 299, 299, 2), (1, 55, 55, 3)]) @pytest.mark.parametrize("kernel_shape", [(3, 2), (1, 3)]) @@ -270,9 +274,7 @@ def binary_elementwise(lhs, rhs): shapes=[ifm_shape, ifm2_shape], ranges=[(0, 1), (0, 2)], accel_type=accel_type, - # non 4D ops legalize into identity op that is not currently supported in the cascader - enable_cascader=(len(ifm_shape) == 4 and len(ifm2_shape) == 4) - and ("u65" not in accel_type), + enable_cascader=is_u55_accel_type(accel_type), ) @@ -301,8 +303,7 @@ def binary_elementwise(lhs, rhs): shapes=[ifm_shape, ifm2_shape], ranges=[(0, 1), (0, 2)], accel_type=accel_type, - # non 4D ops legalize into identity op that is not currently supported in the cascader - enable_cascader=False, + enable_cascader=is_u55_accel_type(accel_type), ) @@ -567,13 +568,12 @@ def generate_output_data(input_data): ethosu_mod = infra.create_ethosu_partition(cpu_mod) infra.compare_ethosu_with_reference( - # identity op is not supported in cascader ethosu_mod, input_data, output_data, accel_type, output_tolerance=1, - enable_cascader=False, + enable_cascader=is_u55_accel_type(accel_type), ) @@ -603,9 +603,12 @@ def create_model(): output_data = generate_ref_data(cpu_mod, input_data) ethosu_mod = infra.create_ethosu_partition(cpu_mod) - # reshape ops legalize into identity op that is not currently supported in the cascader infra.compare_ethosu_with_reference( - ethosu_mod, input_data, output_data, accel_type, enable_cascader=False + ethosu_mod, + input_data, + output_data, + accel_type, + enable_cascader=is_u55_accel_type(accel_type), ) @@ -626,8 +629,9 @@ def test_tflite_slice(accel_type, ifm_shape, begin, size): def slice_func(x): return tf.slice(x, begin, size) - # Ops that get legalized to identity is currently not supported by the cascader - infra.compare_tvm_with_tflite(slice_func, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + 
slice_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -642,9 +646,8 @@ def test_tflite_strided_slice(accel_type, ifm_shape, begin, end): def strided_slice_func(x): return tf.strided_slice(x, begin, end) - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite( - strided_slice_func, [ifm_shape], accel_type, enable_cascader=False + strided_slice_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) ) @@ -667,12 +670,11 @@ def abs_func(x): op = tf.math.abs(x) return op - # non-4D tensors are legalized to identity which are not supported by the cascader infra.compare_tvm_with_tflite( abs_func, [ifm_shape], accel_type, - enable_cascader=(len(ifm_shape) == 4) and ("u65" not in accel_type), + enable_cascader=is_u55_accel_type(accel_type), ) @@ -752,8 +754,9 @@ def tanh_func(x): op = tf.nn.tanh(x) return op - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(tanh_func, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + tanh_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -774,7 +777,6 @@ def concat_func(*inputs): op = tf.concat(list(inputs), axis) return op - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite(concat_func, shapes, accel_type, enable_cascader=False) @@ -788,8 +790,9 @@ def sigmoid_function(x): op = tf.nn.sigmoid(x) return op - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(sigmoid_function, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + sigmoid_function, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) # This codegen test checks both, split and split_v @@ -813,7 +816,6 @@ def split_func(x): op = tf.split(x, num_or_size_splits, axis=axis) return op - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite(split_func, [ifm_shape], accel_type, enable_cascader=False) @@ -845,9 +847,12 @@ def create_model(): output_data = generate_ref_data(cpu_mod, input_data) ethosu_mod = partition_for_ethosu(cpu_mod) - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_ethosu_with_reference( - ethosu_mod, input_data, output_data, accel_type, enable_cascader=False + ethosu_mod, + input_data, + output_data, + accel_type, + enable_cascader=is_u55_accel_type(accel_type), ) @@ -860,8 +865,9 @@ def test_tflite_expand_dims(accel_type, ifm_shape, axis): def expand_dims_func(x): return tf.expand_dims(x, axis=axis) - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(expand_dims_func, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + expand_dims_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -875,8 +881,9 @@ def test_tflite_squeeze(accel_type, ifm_shape, axis): def squeeze_func(x): return tf.squeeze(x, axis=axis) - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(squeeze_func, [ifm_shape], accel_type, enable_cascader=False) + 
infra.compare_tvm_with_tflite( + squeeze_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -894,8 +901,9 @@ def resize_model(x): x, size, align_corners=align_corners, half_pixel_centers=False ) - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + resize_model, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -918,8 +926,9 @@ def resize_model(x): x, size, align_corners=align_corners, half_pixel_centers=False ) - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + resize_model, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -959,9 +968,11 @@ def conv2d_transpose(x): op = tf.nn.bias_add(op, bias) return op - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite( - conv2d_transpose, [ifm_shape], accel_type=accel_type, enable_cascader=False + conv2d_transpose, + [ifm_shape], + accel_type=accel_type, + enable_cascader=is_u55_accel_type(accel_type), ) @@ -982,7 +993,6 @@ def test_tflite_pack(accel_type, ifm_shapes, axis): def pack_func(*inputs): return tf.stack(inputs, axis=axis) - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite(pack_func, ifm_shapes, accel_type, enable_cascader=False) @@ -998,7 +1008,6 @@ def test_tflite_unpack(accel_type, ifm_shape, axis): def unpack_func(x): return tf.unstack(x, axis=axis) - # Ops that get legalized to identity are currently not supported by the cascader infra.compare_tvm_with_tflite(unpack_func, [ifm_shape], accel_type, enable_cascader=False) @@ -1012,8 +1021,9 @@ def test_tflite_leaky_relu(accel_type, ifm_shape, alpha): def leaky_relu_func(x): return tf.nn.leaky_relu(x, alpha=alpha) - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(leaky_relu_func, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + leaky_relu_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @@ -1045,8 +1055,9 @@ def fully_connected(x): x = tf.nn.relu(x) return x - # Ops that get legalized to identity are currently not supported by the cascader - infra.compare_tvm_with_tflite(fully_connected, [ifm_shape], accel_type, enable_cascader=False) + infra.compare_tvm_with_tflite( + fully_connected, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + ) if __name__ == "__main__": From 62e449cb858bde9be0bdd3903f3515916bff0131 Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Wed, 1 Jun 2022 09:54:10 -0700 Subject: [PATCH 008/181] [microTVM][ARM]Add tests for arm schedules (#11472) * add more tests for arm_cpu schedules conv1d_ncw, conv1d_nwc, conv2d_NCHWc, depthwise_conv2d_NCHWc, dense_dsp, avg_ pool and max_pool tests are added. 
Co-authored-by: Mohamad --- .../relay/strategy/arm_cpu/test_avg_pool.py | 168 ++++++++++++++++++ .../relay/strategy/arm_cpu/test_conv1d_ncw.py | 117 ++++++++++++ .../relay/strategy/arm_cpu/test_conv1d_nwc.py | 145 +++++++++++++++ .../strategy/arm_cpu/test_conv2d_NCHWc.py | 138 ++++++++++++++ .../relay/strategy/arm_cpu/test_dense_dsp.py | 90 ++++++++++ .../arm_cpu/test_depthwise_conv2d_NCHWc.py | 121 +++++++++++++ .../relay/strategy/arm_cpu/test_max_pool.py | 135 ++++++++++++++ 7 files changed, 914 insertions(+) create mode 100644 tests/python/relay/strategy/arm_cpu/test_avg_pool.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_dense_dsp.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_max_pool.py diff --git a/tests/python/relay/strategy/arm_cpu/test_avg_pool.py b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py new file mode 100644 index 0000000000000..31a812b38eed7 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicPoolTests: + @tvm.testing.requires_corstone300 + def test_pool( + self, + pool_type, + shape, + dtype, + pool_size, + strides, + padding, + dilation, + layout, + ceil_mode, + count_include_pad, + schedule_name, + ): + """Test a subgraph with a single pool operator.""" + ishape = shape + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + + out0 = getattr(relay.op.nn, pool_type)( + input0, + pool_size=pool_size, + strides=strides, + dilation=dilation, + padding=padding, + layout=layout, + out_layout="", + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + out1 = getattr(relay.op.nn, pool_type)( + input1, + pool_size=pool_size, + strides=strides, + dilation=dilation, + padding=padding, + layout=layout, + out_layout="", + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestAvgPool1d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + ( + shape, + pool_size, + strides, + padding, + dilation, + layout, + ceil_mode, + count_include_pad, + ) = tvm.testing.parameters( + ((3, 32, 27), (3,), (2,), 0, 1, "NCW", False, False), + ((3, 32, 27), (3,), (2,), 0, 1, "NWC", False, False), + ((3, 32, 27), (3,), (2,), 0, 1, "NCW", True, False), + ((3, 32, 27), (3,), (2,), 1, 1, "NCW", False, True), + ((1, 1, 32), 3, 1, 0, 1, "NCW", False, False), + ((1, 4, 20), 3, 2, 2, 1, "NCW", False, False), + ) + pool_type = tvm.testing.parameter("avg_pool1d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +class TestAvgPool2d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + ( + shape, + pool_size, + strides, + padding, + dilation, + layout, + ceil_mode, + count_include_pad, + ) = tvm.testing.parameters( + ((3, 32, 27, 27), (3, 3), (2, 2), 0, 1, "NCHW", False, False), + ((3, 32, 27, 27), (3, 3), (2, 2), 0, 1, "NHWC", False, False), + ((2, 16, 27, 27), (3, 3), (2, 2), 0, 1, "NCHW", True, False), + ((2, 27, 27, 16), (3, 3), (2, 2), 0, 1, "NHWC", True, False), + ((2, 16, 27, 27), (3, 3), (2, 2), 0, 1, "NCHW", True, True), + ((1, 25, 5, 64), (25, 5), (25, 5), 0, 1, "NHWC", False, False), + ((1, 3, 3, 256), (3, 3), (3, 3), 0, 1, "NHWC", False, False), + ((1, 8, 8, 64), (8, 8), (8, 8), 0, 1, "NHWC", False, False), + ((1, 1, 32, 32), (3, 3), 1, 0, 1, "NCHW", False, False), + ((1, 4, 32, 20), (3, 3), (2, 2), 0, 1, "NCHW", False, False), + ) + pool_type = tvm.testing.parameter("avg_pool2d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +class TestAvgPool3d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + ( + shape, + pool_size, + strides, + padding, + 
dilation, + layout, + ceil_mode, + count_include_pad, + ) = tvm.testing.parameters( + ((3, 4, 8, 27, 27), (3, 3, 3), 2, 0, 1, "NCDHW", False, False), + ) + pool_type = tvm.testing.parameter("avg_pool3d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py new file mode 100644 index 0000000000000..0f0507cfe7d3d --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicConv1dTests: + @tvm.testing.requires_corstone300 + def test_conv1d( + self, + data_shape, + kernel_size, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv1d_ncw operator.""" + ishape = data_shape + wshape = (num_filter, data_shape[1], kernel_size) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv1d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout="NCW", + kernel_layout="OIW", + out_dtype="int32", + out_layout="NCW", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + + out1 = relay.op.nn.conv1d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout="NCW", + kernel_layout="OIW", + out_dtype="int32", + out_layout="NCW", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestConv1d_ncw(BasicConv1dTests): + """This test is for conv1d_ncw.generic schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((4, 32, 16), 3, 12, 1, 0, 1), + ((4, 16, 
32), 3, 12, 1, 0, 1), + ((1, 12, 32), 3, 16, 1, 0, 1), + ((3, 10, 12), 4, 24, 1, 0, 1), + ((1, 7, 7), 3, 5, 1, 0, 1), + ((1, 2, 10), 4, 4, 2, (1, 1), 1), + ((1, 2, 20), 4, 4, 2, (0, 1), 1), + ((1, 4, 16), 1, 12, 1, (1, 0), 1), + ((1, 16, 24), 1, 32, 3, (2, 2), 1), + ) + dtype = tvm.testing.parameter("int8", "int16") + data_layout = tvm.testing.parameter("NCW") + schedule_name = tvm.testing.parameter("conv1d_ncw.generic") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py new file mode 100644 index 0000000000000..e430ade2fac14 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicConv1dTests: + @tvm.testing.requires_corstone300 + def test_conv1d( + self, + data_shape, + kernel_size, + kernel_layout, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv1d_nwc operator.""" + ishape = data_shape + wshape = (kernel_size, data_shape[-1], num_filter) + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv1d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout="NWC", + kernel_layout="WIO", + out_dtype="int32", + out_layout="NWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + + if kernel_layout == "WOI": + weight1 = relay.const(np.moveaxis(weight_data, 1, -1)) + else: + weight1 = relay.const(weight_data) + + out1 = relay.op.nn.conv1d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout="NWC", + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout="NWC", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, 
+ ) + + +class TestConv1d_dsp(BasicConv1dTests): + """This test is for conv1d_dsp schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((4, 32, 16), 3, 12, 1, 0, 1), + ((4, 16, 32), 3, 12, 1, 0, 1), + ((4, 32, 16), 3, 12, 1, 0, 1), + ((1, 32, 12), 3, 16, 1, 0, 1), + # TODO: The following 4 tests fail due to https://github.com/apache/tvm/issues/11466 + # ((3, 12, 10), 4, 24, 1, 0, 1), + # ((1, 7, 7), 3, 5, 1, 0, 1), + # ((1, 10, 2), 4, 4, 2, (1, 1), 1), + # ((1, 20, 2), 4, 4, 2, (0, 1), 1), + ((1, 16, 4), 1, 12, 1, (1, 0), 1), + ((1, 24, 16), 1, 32, 3, (2, 2), 1), + ) + dtype = tvm.testing.parameter("int8", "int16") + data_layout = tvm.testing.parameter("NWC") + kernel_layout = tvm.testing.parameter("WOI") + schedule_name = tvm.testing.parameter("conv1d_dsp") + + +class TestConv1d_nwc(BasicConv1dTests): + """This test is for conv1d_nwc.generic schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((4, 32, 16), 3, 12, 1, 0, 1), + ((4, 16, 32), 3, 12, 1, 0, 1), + ((4, 32, 16), 3, 12, 1, 0, 1), + ((1, 32, 12), 3, 16, 1, 0, 1), + ((3, 12, 10), 4, 24, 1, 0, 1), + ((1, 7, 7), 3, 5, 1, 0, 1), + ((1, 10, 2), 4, 4, 2, (1, 1), 1), + ((1, 20, 2), 4, 4, 2, (0, 1), 1), + ((1, 16, 4), 1, 12, 1, (1, 0), 1), + ((1, 24, 16), 1, 32, 3, (2, 2), 1), + ) + dtype = tvm.testing.parameter("int8", "int16") + data_layout = tvm.testing.parameter("NWC") + kernel_layout = tvm.testing.parameter("WIO") + schedule_name = tvm.testing.parameter("conv1d_nwc.generic") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py new file mode 100644 index 0000000000000..3b43d37c9075f --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicConv2dTests: + @tvm.testing.requires_corstone300 + def test_conv2d_NCHWc( + self, + data_shape, + kernel_size, + data_layout, + kernel_layout, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv2d_NCHWc operator.""" + ishape = data_shape + wshape = (num_filter, data_shape[1], *kernel_size) + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.contrib_conv2d_nchwc( + relay.layout_transform(input0, "NCHW", data_layout), + relay.layout_transform(weight0, "OIHW", kernel_layout), + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout=data_layout, + kernel_layout=kernel_layout, + channels=num_filter, + out_dtype="", + out_layout="", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + out1 = relay.op.nn.contrib_conv2d_nchwc( + relay.layout_transform(input1, "NCHW", data_layout), + relay.layout_transform(weight1, "OIHW", kernel_layout), + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout=data_layout, + kernel_layout=kernel_layout, + channels=num_filter, + out_dtype="", + out_layout="", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestConv2d_NCHWc(BasicConv2dTests): + """This test is for conv2d_NCHWc.x86 schedule.""" + + ( + data_shape, + kernel_size, + num_filter, + strides, + padding, + dilation, + dtype, + kernel_layout, + data_layout, + ) = tvm.testing.parameters( + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int16", "OIHW4i4o", "NCHW4c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int32", "OIHW4i4o", "NCHW4c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int8", "OIHW2i8o", "NCHW8c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int16", "OIHW2i8o", "NCHW8c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1), (1, 1), "int32", "OIHW2i8o", "NCHW8c"), + # ResNet18 workloads + # this test does not fit in corstone300 DCTM section. 
+ # ((1, 3, 112, 112), (7, 7), 64, (2, 2), (3, 3), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 64, 28, 28), (3, 3), 64, (1, 1), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 64, 28, 28), (1, 1), 64, (1, 1), (0, 0), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 64, 28, 28), (3, 3), 128, (2, 2), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 64, 28, 28), (1, 1), 128, (2, 2), (0, 0), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 128, 14, 14), (3, 3), 128, (1, 1), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 128, 14, 14), (3, 3), 256, (2, 2), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 128, 14, 14), (1, 1), 256, (2, 2), (0, 0), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 256, 7, 7), (3, 3), 256, (1, 1), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 256, 7, 7), (3, 3), 512, (2, 2), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 256, 7, 7), (1, 1), 512, (2, 2), (0, 0), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ((1, 512, 3, 3), (3, 3), 512, (1, 1), (1, 1), (1, 1), "int8", "OIHW4i4o", "NCHW4c"), + ) + schedule_name = tvm.testing.parameter("conv2d_NCHWc.x86") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py new file mode 100644 index 0000000000000..3edffba8acaa6 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicDenseTests: + @tvm.testing.requires_corstone300 + def test_dense(self, shape, weight_shape, dtype, schedule_name): + """Test a subgraph with a single dense operator.""" + ishape = shape + wshape = weight_shape + units = weight_shape[0] + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.dense( + input0, + weight0, + units=units, + out_dtype="int32", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + out1 = relay.op.nn.dense( + input1, + weight1, + units=units, + out_dtype="int32", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestDense(BasicDenseTests): + """This test is for dense_dsp schedule.""" + + shape, weight_shape = tvm.testing.parameters( + ((1, 128), (16, 128)), + ((32, 32), (32, 32)), + ((1, 64), (1, 64)), + ((11, 2), (2, 2)), + ((1, 32), (64, 32)), + ((3, 12), (10, 12)), + ) + dtype = tvm.testing.parameter("int8", "int16") + schedule_name = tvm.testing.parameter("dense_dsp") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py new file mode 100644 index 0000000000000..69e9ab09e4c95 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicConv2dTests: + @tvm.testing.requires_corstone300 + def test_depthwise_conv2d_NCHWc( + self, + data_shape, + kernel_size, + data_layout, + kernel_layout, + groups, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single depthwise_conv2d_nchwc operator.""" + ishape = data_shape + wshape = (data_shape[1], 1, *kernel_size) + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + groups = groups + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.contrib_depthwise_conv2d_nchwc( + relay.layout_transform(input0, "NCHW", data_layout), + relay.layout_transform(weight0, "OIHW", kernel_layout), + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout=data_layout, + kernel_layout=kernel_layout, + groups=groups, + out_dtype="", + out_layout="", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + out1 = relay.op.nn.contrib_depthwise_conv2d_nchwc( + relay.layout_transform(input1, "NCHW", data_layout), + relay.layout_transform(weight1, "OIHW", kernel_layout), + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=dilation, + data_layout=data_layout, + kernel_layout=kernel_layout, + groups=groups, + out_dtype="", + out_layout="", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestDepthWiseConv2d_NCHWc(BasicConv2dTests): + """This test is for depthwise_conv2d_NCHWc schedule.""" + + ( + data_shape, + kernel_size, + groups, + strides, + padding, + dilation, + kernel_layout, + data_layout, + ) = tvm.testing.parameters( + ((1, 16, 32, 32), (3, 3), 16, (1, 1), (1, 1, 1, 1), (1, 1), "OIHW1i4o", "NCHW4c"), + ((1, 16, 32, 32), (3, 3), 12, (1, 1), (1, 1, 1, 1), (1, 1), "OIHW1i8o", "NCHW8c"), + ) + dtype = tvm.testing.parameter("int8", "int16", "int32") + schedule_name = tvm.testing.parameter("depthwise_conv2d_NCHWc") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_max_pool.py b/tests/python/relay/strategy/arm_cpu/test_max_pool.py new file mode 100644 index 0000000000000..f58a041ecb746 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_max_pool.py @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from pickle import FALSE +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicPoolTests: + @tvm.testing.requires_corstone300 + def test_pool( + self, + pool_type, + shape, + dtype, + pool_size, + strides, + padding, + dilation, + layout, + ceil_mode, + schedule_name, + ): + """Test a subgraph with a single max_pool operator.""" + ishape = shape + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + + out0 = getattr(relay.op.nn, pool_type)( + input0, + pool_size=pool_size, + strides=strides, + dilation=dilation, + padding=padding, + layout=layout, + out_layout="", + ceil_mode=ceil_mode, + ) + + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + out1 = getattr(relay.op.nn, pool_type)( + input1, + pool_size=pool_size, + strides=strides, + dilation=dilation, + padding=padding, + layout=layout, + out_layout="", + ceil_mode=ceil_mode, + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestMaxPool1d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + shape, pool_size, strides, padding, dilation, layout, ceil_mode = tvm.testing.parameters( + ((3, 32, 27), (3,), (2,), 0, 1, "NCW", True), + ((1, 32, 1), 3, 1, 0, 1, "NWC", False), + ((1, 20, 4), 3, 2, 0, 1, "NWC", False), + ) + pool_type = tvm.testing.parameter("max_pool1d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +class TestMaxPool2d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + shape, pool_size, strides, padding, dilation, layout, ceil_mode = tvm.testing.parameters( + ((2, 32, 27, 27), (3, 3), (2, 2), 0, 1, "NCHW", False), + ((2, 32, 27, 27), (3, 3), (2, 2), 0, 1, "NCHW", True), + ((1, 26, 26, 12), (2, 2), (2, 2), 0, 1, "NHWC", False), + ((1, 11, 11, 32), (2, 2), (2, 2), 0, 1, "NHWC", False), + ((1, 3, 3, 64), (2, 2), (2, 2), 0, 1, "NHWC", False), + ((1, 32, 32, 1), (3, 3), 1, 0, 1, "NHWC", False), + ((1, 32, 20, 4), (3, 3), (2, 2), 0, 1, "NHWC", False), + ((1, 32, 32, 1), (3, 3), 1, 0, 1, "NHWC", True), + ((1, 32, 20, 4), (3, 3), (2, 2), 0, 1, "NHWC", True), + ) + pool_type = tvm.testing.parameter("max_pool2d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +class TestMaxPool3d(BasicPoolTests): + """This test is for pool.arm_cpu schedule.""" + + shape, pool_size, strides, padding, dilation, layout, ceil_mode = tvm.testing.parameters( + ((3, 4, 8, 27, 27), 
(3, 3, 3), 2, 0, 1, "NCDHW", False), + ) + pool_type = tvm.testing.parameter("max_pool3d") + dtype = tvm.testing.parameter("int32") + schedule_name = tvm.testing.parameter("pool.arm_cpu") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 89c02358a13f2e744580c4615bfeb06962d71965 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Wed, 1 Jun 2022 11:51:35 -0700 Subject: [PATCH 009/181] [Relay] Plumb external codegen target via Target.current() (#11432) * [Relay] Plumb external codegen target via Target.current() for all external codegen paths (See https://discuss.tvm.apache.org/t/byoc-supporting-cutlass-byoc-with-collage/12796/6 for context, which in turn is part of Collage (https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md). We want both old-style (via relay.ext.$toolchain) and new-style (via "RelayToTIR" Pass attribute on target kind) external codegen to be able to access the current 'external codegen' Target instance via Target.current(). - For old-style, plumb the true Target through TEComplier and push it on the context stack before calling relay.ext.$toolchain. - For new-style, pass the CompilationConfig to the RelayToTIRTargetHook pass, make the jump from "Compiler" attribute value to Target via the new CompilationConfig::FindPrimitiveTargetForKind method, and push on the stack before invoking the custom "RelayToTIR" pass. While working on this discovered RelayToTIRTargetHook was incompatible with the VM's compilation flow since RelayToTIRTargetHook assumes all "Compiler" attributed functions are inlined. Generalize it to support both inline and global function styles. Extend Target::IsExternalCodegen to recognize target kinds with "RelayToTIR" attributes as external. Update target hooks unit test to exercise new support for outline-style, picking up the current target, and compiling via the VM. * - A bit of polishing en passant. * - Add comment as per Josh's suggestion Can't repro tests/python/contrib/test_ethosu/cascader/test_scheduler.py::test_compute_cycles_annotation failure, flake? 
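For illustration only, a minimal Python sketch (not part of this patch) of what the change enables for old-style external codegen: a hook registered under relay.ext.<toolchain> can now observe its matching Target via Target.current(). The "my_ext" kind name, its "flag" attribute, and the CSourceModuleCreate wrapping are assumptions made for the sketch, not APIs added by this PR.

import tvm

@tvm.register_func("relay.ext.my_ext")
def my_ext_codegen(func):
    # With this change the true Target is pushed onto the context stack before
    # relay.ext.my_ext is invoked, so the hook can read its target options here.
    target = tvm.target.Target.current(allow_none=True)
    flag = ""
    if target is not None and target.kind.name == "my_ext":
        flag = str(target.attrs.get("flag", ""))  # hypothetical target attribute
    symbol = func.attrs["global_symbol"]
    code = "// generated for %s (flag=%s)\n" % (symbol, flag)
    # Wrap the generated source as a runtime module bound to the function's symbol.
    return tvm.get_global_func("runtime.CSourceModuleCreate")(code, "c", [symbol], [])

New-style hooks get the equivalent behaviour from RelayToTIRTargetHook, which now takes the CompilationConfig, finds the Target whose kind matches the function's "Compiler" attribute, and pushes it before running the custom "RelayToTIR" pass (see the example_target_hooks changes below).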
--- include/tvm/relay/transform.h | 43 +++- include/tvm/target/target_kind.h | 10 + src/relay/backend/aot_executor_codegen.cc | 2 +- src/relay/backend/contrib/cmsisnn/target.cc | 2 +- .../backend/contrib/codegen_c/codegen.cc | 12 ++ src/relay/backend/contrib/ethosu/codegen.cc | 2 +- .../example_target_hooks/relay_to_tir.cc | 200 +++++++++++++----- .../contrib/example_target_hooks/target.cc | 5 +- src/relay/backend/graph_executor_codegen.cc | 2 +- src/relay/backend/interpreter.cc | 8 +- src/relay/backend/te_compiler.cc | 57 ++--- src/relay/backend/te_compiler.h | 11 +- src/relay/backend/vm/compiler.cc | 34 +-- src/relay/backend/vm/compiler.h | 4 +- src/relay/transforms/dead_code.cc | 2 + src/relay/transforms/inline.cc | 1 + src/relay/transforms/target_hooks.cc | 150 ++++++++++--- src/target/target.cc | 8 +- tests/cpp/target_test.cc | 6 + tests/python/frontend/onnx/test_forward.py | 2 +- .../relay/dyn/test_dynamic_op_level2.py | 4 +- tests/python/relay/test_external_codegen.py | 54 +++++ tests/python/relay/test_target_hooks.py | 53 ++++- tests/python/relay/utils/external_codegen.py | 2 +- 24 files changed, 512 insertions(+), 162 deletions(-) diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 0d518e4ed547e..6e3bddf9adf5c 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -462,11 +462,50 @@ TVM_DLL Pass RemoveUnusedFunctions(Array entry_functions); TVM_DLL Pass SimplifyExpr(); /*! - * \brief Run any registered RelayToTIR passes registered on the functions in a module. + * \brief Run any custom passes registered under "RelayToTIR" attributes on TargetKinds. + * + * This pass looks for inline, let-bound or global functions which have a "Compiler" attribute. + * If the attribute value corresponds to a TargetKind with a "RelayToTIR" attribute, then the + * 'custom' pass bound to that attribute is run (at most once) on the IRModule as a whole. + * + * If, in addition, the \p config has a Target with a matching TargetKind, that Target is set + * as the 'current' target before the custom pass is executed. In this way it is possible + * for custom passes to pick up target options which may guide how they transform the IRModule. + * (Those targets are referred to as 'extern codegen targets' elsewhere). + * + * A typical custom pass will: + * - Find calls to "Compiler" attributes functions with matching compiler name. + * - Lower those function to TIR PrimFuncs. + * - Bind those functions into the IRModule under the the functions' "global_symbol" attribute. + * - Replace all calls to those functions with 'call_lowered' to the matching global. + * Care should be taken to handle multiple calls to the same function. + * See src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc for an example custom pass. + * + * It is also possible (despite the pass and attribute names!) for the custom pass to proceed + * directly to a runtime::Module, which can be attached to the output IRModules "external_mods" + * attribute (taking care not to clobber any existing modules). In this case the flow is as above, + * except: + * - The runtime::Module must contain a binding for each compiled function under their + * "global_symbol" (ie runtime::Module::ImplementsFunction should return true). + * - A Relay Function must be bound (or re-bound) into the result IRModule, again with the same + * "global_symbol", but with only the "Extern" attribute set to Integer(1). The function body + * should be the original function body. 
In this way we always have a TVM definition matching + * every global function name. + * + * There are many existing runtime::Modules, ranging from source to object to dynamic libaries to + * entirely custom implementations. Some of those may require additional compilation using + * 'export_library' on the final build artifact. + * + * The OutlineCompilerFunctionsWithExistingGlobalSymbols and MarkCompilerFunctionsAsExtern utility + * passes can be used by custom passes to take care of some of the boilerplate. + * + * TODO(mbs): Rename PreLoweringTargetHooks? + * + * \param config All available targets. * * \return The pass. */ -TVM_DLL Pass RelayToTIRTargetHook(); +TVM_DLL Pass RelayToTIRTargetHook(CompilationConfig config); /*! * \brief A pass for manifesting explicit memory allocations and rewriting diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h index 395d3aab6757b..4879470e76545 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -402,6 +402,16 @@ namespace attr { * See also \p Target::IsExternalCodegenFor */ constexpr const char* kIsExternalCodegen = "is_external_codegen"; + +/*! + * \brief A \p TargetKind attribute of type \p FTVMRelayToTIR. If set, then the target kind name + * also corresponds to an external codegen 'compiler' name, and the bound value is a \p Pass + * to apply before the TVM lowering. + * + * See also \p Target::IsExternalCodegenFor + */ +constexpr const char* kRelayToTIR = "RelayToTIR"; + } // namespace attr /*! diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 60f108aacf662..167afd2c5f782 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -1079,7 +1079,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { // lowering process directly. 
tec::UpdateFunctionMetadata(func, this->function_metadata_, workspace_byte_alignment); }, - config_->host_virtual_device)(mod); + config_)(mod); auto lowered_main = lowered_mod->Lookup("main"); auto lowered_main_func = GetRef(lowered_main.as()); diff --git a/src/relay/backend/contrib/cmsisnn/target.cc b/src/relay/backend/contrib/cmsisnn/target.cc index 99bc0bc7cb205..fd2f18aa9905b 100644 --- a/src/relay/backend/contrib/cmsisnn/target.cc +++ b/src/relay/backend/contrib/cmsisnn/target.cc @@ -31,7 +31,7 @@ tvm::transform::Pass RelayToTIR(); runtime::Module TIRToRuntime(IRModule mod, Target target); TVM_REGISTER_TARGET_KIND("cmsis-nn", kDLCPU) - .set_attr("RelayToTIR", RelayToTIR()) + .set_attr(tvm::attr::kRelayToTIR, RelayToTIR()) .set_attr("TIRToRuntime", TIRToRuntime); } // namespace cmsisnn diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 19b8c579cd8b5..fd1c39bb92830 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -227,6 +227,14 @@ class CSourceCodegen : public CSourceModuleCodegenBase { Array variables = std::get<0>(res); String func_name = std::get<1>(res); + Optional opt_target = Target::Current(); + if (opt_target.defined() && opt_target.value()->kind->name == "ccompiler") { + Optional header = opt_target.value()->GetAttr("header"); + if (header.defined() && !header.value().empty()) { + code_stream_ << header.value().c_str() << "\n"; + } + } + // Create headers code_stream_ << "#include \n"; code_stream_ << "#include \n"; @@ -293,6 +301,10 @@ runtime::Module CCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.ccompiler").set_body_typed(CCompiler); +TVM_REGISTER_TARGET_KIND("ccompiler", kDLCPU) + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)) + .add_attr_option("header", String("")); // value is prepended to every output CModule + } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/ethosu/codegen.cc b/src/relay/backend/contrib/ethosu/codegen.cc index 47c80b47c5790..afa17750d8a8c 100644 --- a/src/relay/backend/contrib/ethosu/codegen.cc +++ b/src/relay/backend/contrib/ethosu/codegen.cc @@ -320,7 +320,7 @@ runtime::Module TIRToRuntime(IRModule mod, Target target) { TVM_REGISTER_TARGET_KIND("ethos-u", kDLCPU) .set_attr("use_device_api", Bool(true)) - .set_attr("RelayToTIR", RelayToTIR()) + .set_attr(tvm::attr::kRelayToTIR, RelayToTIR()) .set_attr("TIRToRuntime", TIRToRuntime); } // namespace ethosu diff --git a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc index c498baa6d11d2..eb6cf1cce4207 100644 --- a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc +++ b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc @@ -28,12 +28,37 @@ #include #include "../../../op/call/call.h" +#include "tvm/tir/function.h" namespace tvm { namespace relay { namespace contrib { namespace example_target_hooks { +namespace { + +/*! + * \brief An example mutator for a "RelayToTIR" custom pass. Replaces every call to a Relay + * Function with "external_symbol" attribute of "replace_add_with_subtract" with a call to a + * TIR PrimFunc implementing subtraction. + * + * Illustrates six aspects a custom 'lowering' style pass may need to account for: + * - Lowerable functions can appear inline as call ops, bound to let-bound variables, or as + * global functions. 
+ * - Let-bound lowerable functions should be inlined on-the-fly since after processing the + * let-binding is no longer required. + * - There may be multiple calls to the same lowerable function. All calls need to be + * rewritten, even though the function itself need be rewritten only once. + * - GlobalVars must be shared between all calls and the new definition itself. + * - Calls to lowered functions must use the "call_lowered" calling convention. + * - The Target::Current() may hold an instance of the TargetKind from which the custom Pass + * was extracted. + * + * Though not illustrated here, it is also valid for a "RelayToTIR" custom pass to add + * runtime::Modules to the output IRModule's "external_mods" attribute. In this case the + * IRModule must be left with an 'extern' Function definition with the matching "external_symbol" + * name. + */ class ConvertAddToSubtract : public MixedModeMutator { public: explicit ConvertAddToSubtract(IRModule ir_module, Target host_target) @@ -56,51 +81,102 @@ class ConvertAddToSubtract : public MixedModeMutator { return tir::BufferLoad(buffer, {index}); } - void ReplaceAddWithSubtractPrimFunc(const GlobalVar& new_global_var, const Function& func) { - tir::Buffer x_buffer = tir::decl_buffer({8}, DataType::Float(32), "x"); - tir::Buffer y_buffer = tir::decl_buffer({8}, DataType::Float(32), "y"); - tir::Buffer out_buffer = tir::decl_buffer({8}, DataType::Float(32)); + GlobalVar ReplaceAddWithSubtractPrimFunc(const Function& func) { + auto func_name = func->GetAttr(::tvm::attr::kGlobalSymbol); + ICHECK(func_name.defined()); - tir::Var x_var("x", DataType::Handle()); - tir::Var y_var("y", DataType::Handle()); - tir::Var out_var("out", DataType::Handle()); + // -------------------------------------------------------------------------------------------- + // Cases: + // - Inline function: + // - First encounter: create global var, rewrite to PrimFunc, add binding, replace call. + // - Thereafter (via object sharing): discover global var already in module, replace call + // - Global function: + // - Assume func_name == global_var->name_hint + // - First encounter: create global var, rewrite to PrimFunc, update binding, replace call + // - Thereafter (via global var): discover global var already in module, replace call + // -------------------------------------------------------------------------------------------- - Map dict_attrs; - dict_attrs.Set("global_symbol", new_global_var->name_hint); - dict_attrs.Set("tir.noalias", Bool(true)); + // If necessary, introduce a new global var to map the function to and copy the source type + // over for InferType. + GlobalVar global_var; + bool need_rewriting; + if (ir_module_->ContainGlobalVar(func_name.value())) { + global_var = ir_module_->GetGlobalVar(func_name.value()); + // Only rewrite to a PrimFunc if the global definition is still a Relay function. + need_rewriting = ir_module_->Lookup(global_var)->IsInstance(); + } else { + global_var = GlobalVar(func_name.value()); + global_var->checked_type_ = func->checked_type(); + need_rewriting = true; + } - te::Var index("index", DataType::Int(32)); - tir::Sub indexed_sub = tir::Sub(LoadIndex(x_buffer, index), LoadIndex(y_buffer, index)); - tir::Stmt math_body = tir::BufferStore(out_buffer, indexed_sub, {index}); - tir::Stmt math_loop = tir::For(index, 0, 8, tir::ForKind::kSerial, math_body); + // For illustration only, check if the current target matches the example_target_hook kind, + // and if so extract the example attribute value. 
+ int64_t example_attribute_value = 0; + Optional opt_current_target = Target::Current(); + if (opt_current_target.defined() && + opt_current_target.value()->kind->name == "example_target_hook") { + example_attribute_value = + opt_current_target.value()->GetAttr("example_attribute").value()->value; + } - Map buffer_map = { - {x_var, x_buffer}, - {y_var, y_buffer}, - {out_var, out_buffer}, - }; + if (need_rewriting) { + // The called function is still in Relay form. Convert to TIR. + tir::Buffer x_buffer = tir::decl_buffer({8}, DataType::Float(32), "x"); + tir::Buffer y_buffer = tir::decl_buffer({8}, DataType::Float(32), "y"); + tir::Buffer out_buffer = tir::decl_buffer({8}, DataType::Float(32)); - tir::PrimFunc replacement_func = tir::PrimFunc({x_var, y_var, out_var}, math_loop, VoidType(), - buffer_map, {}, DictAttrs(dict_attrs)); + tir::Var x_var("x", DataType::Handle()); + tir::Var y_var("y", DataType::Handle()); + tir::Var out_var("out", DataType::Handle()); - // Switch to TIRToRuntime hook for testing - Bool tir_to_runtime = func->GetAttr("tir_to_runtime").value_or(Bool(false)); - if (tir_to_runtime) { - replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, custom_target_); - } else { - replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, host_target_); + Map dict_attrs; + dict_attrs.Set("global_symbol", global_var->name_hint); + dict_attrs.Set("tir.noalias", Bool(true)); + + te::Var index("index", DataType::Int(32)); + tir::Sub indexed_sub = tir::Sub(LoadIndex(x_buffer, index), LoadIndex(y_buffer, index)); + if (example_attribute_value > 0) { + // For illustration only, fold the example attribute into the result. + indexed_sub = tir::Sub(indexed_sub, FloatImm(DataType::Float(32), + static_cast(example_attribute_value))); + } + + tir::Stmt math_body = tir::BufferStore(out_buffer, indexed_sub, {index}); + tir::Stmt math_loop = tir::For(index, 0, 8, tir::ForKind::kSerial, math_body); + + Map buffer_map = { + {x_var, x_buffer}, + {y_var, y_buffer}, + {out_var, out_buffer}, + }; + + tir::PrimFunc replacement_func = tir::PrimFunc({x_var, y_var, out_var}, math_loop, VoidType(), + buffer_map, {}, DictAttrs(dict_attrs)); + + // Switch to TIRToRuntime hook for testing + Bool tir_to_runtime = func->GetAttr("tir_to_runtime").value_or(Bool(false)); + if (tir_to_runtime) { + replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, custom_target_); + } else { + replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, host_target_); + } + + ir_module_->Update(global_var, replacement_func); // Will Add if global_var is new. } - ir_module_->Add(new_global_var, replacement_func); + return global_var; } + using MixedModeMutator::VisitExpr_; + Expr VisitExpr_(const LetNode* op) final { auto pre_visit = [this](const LetNode* op) { Expr var = this->VisitExpr(op->var); Expr value = this->VisitExpr(op->value); - // Outlineable function no longer needs let binding - if (this->CanLowerExpr(value)) { + if (AsLowerableFunction(value)) { + // Inline on-the-fly if the let-bound value is lowerable. this->memo_[var] = value; } }; @@ -110,8 +186,8 @@ class ConvertAddToSubtract : public MixedModeMutator { Expr body = this->VisitExpr(op->body); auto expr = GetRef(op); - // Drop the let binding - if (this->CanLowerExpr(value)) { + if (AsLowerableFunction(value)) { + // The let binding is no longer needed since inlined on-the-fly above. 
this->memo_[expr] = this->VisitExpr(op->body); } else { Var var = Downcast(this->VisitExpr(op->var)); @@ -126,39 +202,49 @@ class ConvertAddToSubtract : public MixedModeMutator { return memo_[GetRef(op)]; } - bool CanLowerExpr(const Expr& expr) { - const auto* func = expr.as(); - if (func == nullptr) { - return false; - } - auto func_name = func->GetAttr(::tvm::attr::kGlobalSymbol); - if (!func_name.defined()) { - return false; + const FunctionNode* AsLowerableFunction(const Expr& expr) { + if (const auto* function_node = expr.as()) { + auto func_name = function_node->GetAttr(::tvm::attr::kGlobalSymbol); + if (!func_name.defined()) { + return nullptr; + } + if (func_name != "replace_add_with_subtract") { + return nullptr; + } + return function_node; + } else if (const auto* global_var_node = expr.as()) { + return AsLowerableFunction(ir_module_->Lookup(GetRef(global_var_node))); + } else { + return nullptr; } - if (func_name != "replace_add_with_subtract") { - return false; + } + + const GlobalVarNode* AsAlreadyLoweredFunction(const Expr& expr) { + if (const auto* global_var_node = expr.as()) { + if (ir_module_->Lookup(GetRef(global_var_node)).as()) { + return global_var_node; + } } - return true; + return nullptr; } Expr Rewrite_(const CallNode* pre, const Expr& post) override { - if (const CallNode* call = post.as()) { - if (CanLowerExpr(call->op)) { - auto* func = call->op.as(); - auto func_name = func->GetAttr(::tvm::attr::kGlobalSymbol); - - // Introduce a new global var to map the function to and copy the source type - // over for InferType - GlobalVar new_global_var(func_name.value()); - new_global_var->checked_type_ = func->checked_type(); - ReplaceAddWithSubtractPrimFunc(new_global_var, GetRef(func)); - + if (const auto* call = post.as()) { + GlobalVar new_op; + if (const auto* function_node = AsLowerableFunction(call->op)) { + // Add or replace the function with a PrimFunc. + new_op = ReplaceAddWithSubtractPrimFunc(GetRef(function_node)); + } else if (const auto* global_var_node = AsAlreadyLoweredFunction(call->op)) { + // The function has already been rewritten, so we just need to update the call. + new_op = GetRef(global_var_node); + } + if (new_op.defined()) { // Since we are replacing the Relay function with a call to a TIR function, we must use // the call_lowered op. 
CallLoweredAttrs attrs; attrs.metadata.Set("relay_attrs", call->attrs); ICHECK(call->type_args.empty()) << "lowered functions cannot be polymorphic"; - return CallLowered(std::move(new_global_var), call->args, std::move(attrs), call->span); + return CallLowered(std::move(new_op), call->args, std::move(attrs), call->span); } } @@ -171,10 +257,12 @@ class ConvertAddToSubtract : public MixedModeMutator { Target custom_target_; }; +} // namespace + transform::Pass RelayToTIR() { runtime::TypedPackedFunc pass_func = [=](IRModule ir_module, transform::PassContext pass_context) { - auto relay_to_tir = ConvertAddToSubtract(ir_module, Target("c")); + ConvertAddToSubtract relay_to_tir(std::move(ir_module), Target("c")); return relay_to_tir.Mutate(); }; return tvm::transform::CreateModulePass(pass_func, 0, "RelayToTIR", {}); diff --git a/src/relay/backend/contrib/example_target_hooks/target.cc b/src/relay/backend/contrib/example_target_hooks/target.cc index 6f1914eac4c3a..19bfa8c682986 100644 --- a/src/relay/backend/contrib/example_target_hooks/target.cc +++ b/src/relay/backend/contrib/example_target_hooks/target.cc @@ -34,7 +34,8 @@ runtime::Module TIRToRuntime(IRModule mod, Target target); TVM_REGISTER_TARGET_KIND("example_target_hook", kDLCPU) .set_attr("use_device_api", Bool(true)) - .set_attr("RelayToTIR", relay::contrib::example_target_hooks::RelayToTIR()) - .set_attr("TIRToRuntime", relay::contrib::example_target_hooks::TIRToRuntime); + .set_attr(attr::kRelayToTIR, relay::contrib::example_target_hooks::RelayToTIR()) + .set_attr("TIRToRuntime", relay::contrib::example_target_hooks::TIRToRuntime) + .add_attr_option("example_attribute", Integer(0)); } // namespace tvm diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 2734439cddbdc..7dba23803f8c7 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -232,7 +232,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorfunction_metadata_); }, - config_->host_virtual_device)(mod); + config_)(mod); Optional main_func_info = lowered_mod->GetAttr("main_func_info"); diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 65ef296516956..9661040eab308 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -945,14 +945,13 @@ class Interpreter : public ExprFunctor, * rewritten \p mod and target-specific modules containing bindings for all TIR primitive * functions needed by the rewritten module. */ -IRModule Prepare(IRModule mod, CompilationConfig config) { - VirtualDevice host_virtual_device = config->host_virtual_device; +IRModule Prepare(IRModule mod, const CompilationConfig& config) { // Run minimal transforms on module to establish invariants needed by interpreter. transform::Sequential seq( {transform::SimplifyInference(), qnn::transform::Legalize(), // Figure out which devices should be used to execute. // TODO(mbs): Should ignore all existing annotations when constant folding - transform::PlanDevices(std::move(config)), + transform::PlanDevices(config), // FuseOps will mark wrapped calls to prim-ops with the 'Primitive' // attribute. 
transform::FuseOps(/*fuse_opt_level=*/0), @@ -962,8 +961,7 @@ IRModule Prepare(IRModule mod, CompilationConfig config) { transform::EtaExpand( /*expand_constructor=*/true, /*expand_global_var=*/false), transform::InferType(), - tec::LowerTEPass(/*module_name=*/"intrp", [](BaseFunc func) { /* no-op */ }, - std::move(host_virtual_device))}); + tec::LowerTEPass(/*module_name=*/"intrp", [](BaseFunc func) { /* no-op */ }, config)}); transform::PassContext pass_ctx = transform::PassContext::Current(); With ctx(pass_ctx); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 76dbfef5386dd..73b44f7361a57 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -299,11 +299,10 @@ class TECompilerImpl : public TECompilerNode { // the module's globals. Furthermore, the external codegen tool must bind the compiled // function to the "global_symbol" attribute on the source_func. So do not use GetUniqueName // here. - auto target = Target("ext_dev"); auto global_var = GlobalVar(opt_global_symbol.value()); global_var->checked_type_ = key->source_func->checked_type(); ir_module->Add(global_var, key->source_func); - value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule{nullptr}, + value->cached_func = CachedFunc(key->target, global_var, {}, {}, te::Schedule{nullptr}, tir::PrimFunc{nullptr}, {}, ir_module); // Collect these here as it's removed in LowerExternalFunctions() device_contexts_.Set(value->cached_func->prim_fn_var, opt_compiler.value()); @@ -531,14 +530,14 @@ using AnalysisRemapping = std::unordered_maptarget); + CCacheKey shape_key(func, config_->host_virtual_device->target); CachedFunc lowered_shape_func = compiler_->LowerShapeFunc(shape_key); // Capture the shape function's global var and parameters 'states' in call @@ -733,7 +732,8 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { // Special case: device_copies are left as calls to primitive operators // (thus undoing FuseOps) so that each backend can handle them directly. - // TODO(mbs): device_copy cleanup. Would be better for FuseOps to just leave device_copy alone. + // TODO(mbs): device_copy cleanup. Would be better for FuseOps to just leave device_copy + // alone. if (const auto* function_node = primitive_func.as()) { DeviceCopyProps device_copy_props = GetDeviceCopyProps(function_node->body); if (device_copy_props.body.defined()) { @@ -771,10 +771,18 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { // Typical case: call to fused primitive Relay Function. // Find the desired target device. Target target; - if (primitive_func->GetAttr(attr::kCompiler).defined()) { - // The generic 'external device' target. - // TODO(mbs): Retire once replaced unified BYOC compiler and target machinery - target = Target("ext_dev"); + Optional opt_compiler = primitive_func->GetAttr(attr::kCompiler); + if (opt_compiler.defined()) { + // This function needs to be compiled with external codegen. + Optional opt_target = config_->FindPrimitiveTargetForKind(opt_compiler.value()); + if (opt_target.defined()) { + // The target is what's supplied by the compilation config for kind matching the + // "Compiler" name. + target = opt_target.value(); + } else { + // Legacy fallback. + target = Target("ext_dev"); + } } else { // The target corresponding to the call_node expression's annotation. 
VirtualDevice virtual_device = GetVirtualDevice(GetRef(call_node)); @@ -791,6 +799,8 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { IRModule module_; ProcessFn process_fn_; + /*! \brief All available targets. */ + CompilationConfig config_; // Map from in-scope let-bound variables to Functions known to be primitive, or PrimFuncs which // have already been lowered. We'll rewrite these to the fresh global vars bound to the lowered // primitive function as we go. Those vars will be bound in the target device-type specific @@ -799,21 +809,15 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { std::unordered_map primitive_functions_; String module_name_; TECompiler compiler_; - /*! - * \brief The \p VirtualDevice for the host, which is where all shape-related data and computation - * must live. - */ - VirtualDevice host_virtual_device_; // Cache ops that need to be frequently used later to reduce lookup overhead. const Op& debug_op_; }; Pass LowerTensorExpr(const String& module_name, TECompiler compiler, ProcessFn process_fn, - VirtualDevice host_virtual_device) { + CompilationConfig config) { runtime::TypedPackedFunc pass_func = [=](Function func, IRModule module, PassContext ctx) { - LowerTensorExprMutator lower_te(module, process_fn, module_name, compiler, - host_virtual_device); + LowerTensorExprMutator lower_te(module, process_fn, config, module_name, compiler); return Downcast(lower_te.Mutate(func)); }; return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); @@ -1043,7 +1047,7 @@ void UpdateFunctionMetadata(BaseFunc func, } IRModule LowerTE(const IRModule& module, const String& module_name, ProcessFn process_fn, - VirtualDevice host_virtual_device) { + CompilationConfig config) { TECompiler compiler(module); // TODO(mbs): This is all unnecessarily convoluted. Better would be to accumulate the rewritten @@ -1058,8 +1062,8 @@ IRModule LowerTE(const IRModule& module, const String& module_name, ProcessFn pr // GlobalVar, and calls updated (sticking with regular Relay Call). // - Calls to functions tagged with "Primitive" are compiled to PrimFuncs, and calls updated // (using call_lowered convention). - IRModule updated_module = LowerTensorExpr(module_name, compiler, std::move(process_fn), - std::move(host_virtual_device))(module); + IRModule updated_module = + LowerTensorExpr(module_name, compiler, std::move(process_fn), std::move(config))(module); // The Functions tagged with "Compiler" are now residing in the cache ready to be // compiled by LowerExternalFunctions. 
However we still need a record of them in the @@ -1159,15 +1163,14 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -Pass LowerTEPass(const String& module_name, ProcessFn process_fn, - VirtualDevice host_virtual_device) { +Pass LowerTEPass(String module_name, ProcessFn process_fn, CompilationConfig complilation_config) { runtime::TypedPackedFunc pass_func = [=](IRModule module, PassContext ctx) { - return LowerTE(module, module_name, process_fn, host_virtual_device); + return LowerTE(module, module_name, process_fn, complilation_config); }; return tvm::transform::Sequential( - {tvm::relay::transform::RelayToTIRTargetHook(), + {tvm::relay::transform::RelayToTIRTargetHook(complilation_config), tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {"InferType"}), InferType(), tvm::tir::transform::ExtractPrimFuncConstants()}); } diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 0b2288d6a156f..8312a20cb862b 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -189,7 +189,8 @@ IRModule LowerTE( const IRModule& module, backend::StaticMemoryPlan memory_plan, const String& module_name, ProcessFn process_fn = [](BaseFunc f) {}); -/*! \brief Pass to lower an IRModule's primitive functions to TIR. +/*! + * \brief Pass to lower an IRModule's primitive functions to TIR. * * This is the "back half" of the Relay compiler which lowers "primitive functions" * to TE expressions, schedules them, and then to TIR. It annotates all functions @@ -198,11 +199,11 @@ IRModule LowerTE( * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower - * \param host_virtual_device \p VirtualDevice for host data and computations - * \returns The pass which lowers primative functions to TIR + * \param config All available targets. + * \returns The pass which lowers primitive functions to TIR */ -transform::Pass LowerTEPass(const String& module_name, ProcessFn process_fn, - VirtualDevice host_virtual_device); +transform::Pass LowerTEPass(String module_name, ProcessFn process_fn, CompilationConfig config); + } // namespace tec } // namespace relay } // namespace tvm diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 5a62ac66f7365..e0b742a840906 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -523,11 +523,13 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { op_index = itr->second; } - // Capture the dictionary of attributes from the original primitive function so that they - // can contribute to the hash of the compiled primitive. This way we can distinguish primitives - // with the same body expression but different attributes which may arbitrarily influence code - // generation. - op_attrs[op_index] = attrs->dict; + if (attrs.defined() && attrs->dict.defined()) { + // Capture the dictionary of attributes from the original primitive function so that they + // can contribute to the hash of the compiled primitive. This way we can distinguish + // primitives with the same body expression but different attributes which may arbitrarily + // influence code generation. 
+ op_attrs[op_index] = attrs->dict; + } Emit(Instruction::InvokePacked(op_index, argument_registers.size(), output_tuple->fields.size(), argument_registers)); @@ -981,25 +983,25 @@ void VMCompiler::LowerImpl(IRModule mod) { } } -transform::Sequential VMCompiler::MemoryOpt(const VirtualDevice& host_virtual_device) { +transform::Sequential VMCompiler::MemoryOpt(const CompilationConfig& config) { Array pass_seqs; // Remove unused functions Array entry_functions{"main"}; pass_seqs.push_back(transform::RemoveUnusedFunctions(entry_functions)); // Manifest the allocations. - pass_seqs.push_back(transform::ManifestAlloc(host_virtual_device)); + pass_seqs.push_back(transform::ManifestAlloc(config->host_virtual_device)); // Compute away possibly introduced constant computation. pass_seqs.push_back(transform::FoldConstant()); // Fuse & lower any new shape functions and device_copies. - pass_seqs.push_back(FuseAndLowerOperators(host_virtual_device)); + pass_seqs.push_back(FuseAndLowerOperators(config)); // Manifest the allocations needed for the shape functions. - pass_seqs.push_back(transform::ManifestAlloc(host_virtual_device)); + pass_seqs.push_back(transform::ManifestAlloc(config->host_virtual_device)); // Fuse & lower any new allocations. - pass_seqs.push_back(FuseAndLowerOperators(host_virtual_device)); + pass_seqs.push_back(FuseAndLowerOperators(config)); // TODO(mbrookhart, jroesch, masahi): this pass is very slow, and is // incomplete to provide memory resuse optimizations. Disable it until we can @@ -1011,10 +1013,10 @@ transform::Sequential VMCompiler::MemoryOpt(const VirtualDevice& host_virtual_de pass_seqs.push_back(transform::FoldConstant()); // Fuse & lower yet again - pass_seqs.push_back(FuseAndLowerOperators(host_virtual_device)); + pass_seqs.push_back(FuseAndLowerOperators(config)); // Create allocations for math introduced by dynamic region math. - pass_seqs.push_back(transform::ManifestAlloc(host_virtual_device)); + pass_seqs.push_back(transform::ManifestAlloc(config->host_virtual_device)); // Compute away possibly introduced constant computation. pass_seqs.push_back(transform::FoldConstant()); @@ -1030,7 +1032,7 @@ transform::Sequential VMCompiler::MemoryOpt(const VirtualDevice& host_virtual_de return transform::Sequential(std::move(pass_seqs)); } -transform::Sequential VMCompiler::FuseAndLowerOperators(const VirtualDevice& host_virtual_device) { +transform::Sequential VMCompiler::FuseAndLowerOperators(const CompilationConfig& config) { Array pass_seqs; // Hoist operators to "primitive" Functions. pass_seqs.push_back(FuseOps()); @@ -1043,7 +1045,7 @@ transform::Sequential VMCompiler::FuseAndLowerOperators(const VirtualDevice& hos backend::UpdateConstants(func, ¶ms_); } }, - host_virtual_device)); + config)); // Since lowered functions are bound in the IRModule, we can now eliminate any unused // let-bound functions. pass_seqs.push_back(DeadCodeElimination(/*inline_once=*/false)); @@ -1094,7 +1096,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) { backend::UpdateConstants(func, ¶ms_); } }, - config_->host_virtual_device)); + config_)); // Since lowered functions are bound in the IRModule, we can now eliminate any unused // let-bound functions. @@ -1111,7 +1113,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) { // external codegen. 
pass_seqs.push_back(transform::Inline()); - pass_seqs.push_back(MemoryOpt(config_->host_virtual_device)); + pass_seqs.push_back(MemoryOpt(config_)); pass_seqs.push_back(transform::InferType()); transform::Sequential seq(pass_seqs); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index a65bdc5ab3cb6..163ec399013b0 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -146,10 +146,10 @@ class VMCompiler : public runtime::ModuleNode { IRModule OptimizeModuleImpl(IRModule mod); /*! \brief Returns the passes which layout memory. */ - transform::Sequential MemoryOpt(const VirtualDevice& host_virtual_device); + transform::Sequential MemoryOpt(const CompilationConfig& config); /*! \brief Returns the passes which fuse then lower Relay primitive operators. */ - transform::Sequential FuseAndLowerOperators(const VirtualDevice& host_virtual_device); + transform::Sequential FuseAndLowerOperators(const CompilationConfig& config); /*! * \brief Populate the global function names in a map where the value is used diff --git a/src/relay/transforms/dead_code.cc b/src/relay/transforms/dead_code.cc index ca1e04ae59fac..45cb8271b0746 100644 --- a/src/relay/transforms/dead_code.cc +++ b/src/relay/transforms/dead_code.cc @@ -534,6 +534,7 @@ namespace transform { // Declared in relay/transform.h Pass DeadCodeElimination(bool inline_once, bool ignore_impurity) { auto pass_func = [=](IRModule mod, PassContext pc) -> IRModule { + VLOG(1) << "Before:" << std::endl << PrettyPrint(mod); // Which let bindings are pure and can be safely elided? std::unordered_map var_to_purity; if (!ignore_impurity) { @@ -566,6 +567,7 @@ Pass DeadCodeElimination(bool inline_once, bool ignore_impurity) { result->Add(kv.first, kv.second); } } + VLOG(1) << "After:" << std::endl << PrettyPrint(result); return result; }; diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index a6e26364bbc4f..c55b6778093e5 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -69,6 +69,7 @@ class Inliner : ExprMutator { for (auto arg : vanilla_call->args) { new_args.push_back(VisitExpr(arg)); } + // TODO(mbs): Does not handle multiple calls to the same global function. cur_node_->RemoveCallTo(gv); return MakeNewExpr(gv, new_args, GetRef(call_node)); } diff --git a/src/relay/transforms/target_hooks.cc b/src/relay/transforms/target_hooks.cc index 0022baf881ba0..00953a1907e13 100644 --- a/src/relay/transforms/target_hooks.cc +++ b/src/relay/transforms/target_hooks.cc @@ -30,61 +30,143 @@ namespace tvm { namespace relay { namespace transform { -class TargetHookVisitor : public tvm::relay::MixedModeVisitor { - /*! \brief Collected pass list for all nodes */ - std::vector pass_list_; - /*! \brief Attribute map for all registered targets */ - TargetKindAttrMap target_attr_map_; - using tvm::relay::MixedModeVisitor::VisitExpr_; +namespace { + +/*! + * \brief A pass extracted from a target kind's "RelayToTIR" attribute, along with any + * 'external codegen' Target instance with matching kind name which should be current when + * the pass is applied. + */ +struct CustomPass { + std::string target_kind_name; + Pass pass; + Optional opt_target; + CustomPass(std::string target_kind_name, Pass pass, Optional opt_target) + : target_kind_name(std::move(target_kind_name)), + pass(std::move(pass)), + opt_target(std::move(opt_target)) {} +}; + +/*! 
+ * \brief Collect all the \p CustomPasses needed according to the "Compiler" attributes on + * inlined or global functions. + */ +class TargetHookVisitor : public MixedModeVisitor { public: - TargetHookVisitor() : target_attr_map_(tvm::TargetKind::GetAttrMap("RelayToTIR")) {} + TargetHookVisitor(IRModule mod, CompilationConfig config) + : mod_(std::move(mod)), + config_(std::move(config)), + target_attr_map_(tvm::TargetKind::GetAttrMap(tvm::attr::kRelayToTIR)) {} - std::vector Visit(const IRModule& ir_mod) { - for (const auto& it : ir_mod->functions) { - if (const auto* function_node = it.second.as()) { + std::vector Visit() { + ICHECK(custom_passes_.empty()); + // To ensure the passes are run in a deterministic order we'll search for functions in + // lexicographic order. + std::vector> functions; + for (const auto& kv : mod_->functions) { + functions.emplace_back(kv.first->name_hint, kv.second); + } + std::sort(functions.begin(), functions.end()); + for (const auto& kv : functions) { + if (const auto* function_node = kv.second.as()) { + // May be a top-level function with a "Compiler" attribute. + MaybeAddPassForFunction(function_node); + } + if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) { + // May have calls to inlined "Compiler" functions in body. VisitExpr(GetRef(function_node)); } } - return pass_list_; + return std::move(custom_passes_); } - void VisitExpr_(const LetNode* op) final { - auto pre_visit = [this](const LetNode* op) { - this->VisitExpr(op->var); - this->VisitExpr(op->value); + private: + using tvm::relay::MixedModeVisitor::VisitExpr_; + + void VisitExpr_(const LetNode* let_node) final { + auto pre_visit = [this](const LetNode* inner_let_node) { + this->VisitExpr(inner_let_node->var); + this->VisitExpr(inner_let_node->value); }; - auto post_visit = [this](const LetNode* op) { - this->VisitExpr(op->body); - this->visit_counter_[op] += 1; + auto post_visit = [this](const LetNode* inner_let_node) { + this->VisitExpr(inner_let_node->body); + this->visit_counter_[inner_let_node] += 1; }; - ExpandANormalForm(op, pre_visit, post_visit); + ExpandANormalForm(let_node, pre_visit, post_visit); + } + + void VisitExpr_(const FunctionNode* function_node) override { + ExprVisitor::VisitExpr_(function_node); + MaybeAddPassForFunction(function_node); } - void VisitExpr_(const FunctionNode* func) override { - ExprVisitor::VisitExpr_(func); - if (!func->GetAttr(attr::kCompiler).defined()) { + /*! + * \brief If \p function_node has a "Compiler" attribute, checks if we should include a + * matching custom pass. Otherwise no-op. + */ + void MaybeAddPassForFunction(const FunctionNode* function_node) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (!opt_compiler) { + // No external codegen required. return; } - String code_gen_name = func->GetAttr(attr::kCompiler).value(); - Optional target_kind = tvm::TargetKind::Get(code_gen_name); - if (!target_kind || !target_attr_map_.count(target_kind.value())) { + // First cross-over: use "Compiler" attribute name as target kind. + std::string kind_name = opt_compiler.value(); + Optional opt_target_kind = tvm::TargetKind::Get(kind_name); + if (!opt_target_kind || !target_attr_map_.count(opt_target_kind.value())) { + // Target kind does not exist or have the "RelayToTIR" attribute, no custom pass to consider. 
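For orientation, the hook this visitor collects is whatever a backend registers under the `kRelayToTIR` attribute of its target kind. A minimal registration sketch, modelled on the `test_external_codegen_3` kind added to `tests/cpp/target_test.cc` later in this patch — the kind name `my_backend` and the pass `MyBackendLoweringPass()` are placeholders, and the attribute-value type shown may differ from the typedef used in-tree:

```cpp
// Hypothetical backend registration. RelayToTIRTargetHook runs the registered pass once
// per kind; if the CompilationConfig carries a Target of this kind it is pushed onto the
// target stack (With<Target>) while the pass executes. With this patch, registering
// kRelayToTIR alone is also enough for Target::IsExternalCodegen() to return true.
TVM_REGISTER_TARGET_KIND("my_backend", kDLCPU)
    .set_attr<tvm::transform::Pass>(tvm::attr::kRelayToTIR, MyBackendLoweringPass());
```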
return; } - Pass custom_target_pass = target_attr_map_[target_kind.value()]; - if (std::find(pass_list_.begin(), pass_list_.end(), custom_target_pass) == pass_list_.end()) { - pass_list_.push_back(custom_target_pass); + if (!seen_kinds_.emplace(kind_name).second) { + // Already accounted for custom pass. + return; } + // Second (optional) cross-over: find unique Target instance in overall available targets with + // the same kind so that it can be made available when custom pass is invoked. + Optional opt_target = config_->FindPrimitiveTargetForKind(opt_compiler.value()); + Pass custom_target_pass = target_attr_map_[opt_target_kind.value()]; + custom_passes_.emplace_back(std::move(kind_name), std::move(custom_target_pass), + std::move(opt_target)); } + + /*! \brief IRModule we are visiting. */ + IRModule mod_; + /*! \brief All available targets. */ + CompilationConfig config_; + /*! \brief Cached attribute map for all registered targets */ + TargetKindAttrMap target_attr_map_; + /*! \brief Which target kind names have already contributed to the custom passes list. */ + std::unordered_set seen_kinds_; + /*! + * \brief All the custom passes to run, paired with their corresponding target instances, if any. + */ + std::vector custom_passes_; }; -Pass RelayToTIRTargetHook() { - auto pass_func = [=](IRModule mod, const PassContext& pass_ctx) { - auto target_hook_visitor = TargetHookVisitor(); - std::vector pass_list = target_hook_visitor.Visit(mod); - Sequential run_hooks(pass_list); +} // namespace - return run_hooks(mod); +Pass RelayToTIRTargetHook(CompilationConfig config) { + auto pass_func = [config = std::move(config)](IRModule mod, const PassContext& pass_ctx) { + VLOG(1) << "Before:" << std::endl << PrettyPrint(mod); + TargetHookVisitor target_hook_visitor(mod, config); + std::vector custom_passes = target_hook_visitor.Visit(); + for (const auto& custom_pass : custom_passes) { + if (custom_pass.opt_target.defined()) { + VLOG(0) << "Invoking custom pass for target " + << custom_pass.opt_target.value()->ToDebugString(); + // Push the target on the stack. + With with_target(custom_pass.opt_target.value()); + // Invoke the pass with target in scope. + mod = custom_pass.pass(mod); + } else { + // Invoke the pass. 
+ VLOG(0) << "Invoking custom pass for target kind '" << custom_pass.target_kind_name << "'"; + mod = custom_pass.pass(mod); + } + } + VLOG(1) << "After:" << std::endl << PrettyPrint(mod); + return mod; }; return tvm::transform::CreateModulePass(pass_func, 0, "RelayToTIRTargetHook", {}); } diff --git a/src/target/target.cc b/src/target/target.cc index 75126ed11c70a..3cdfa0cc0d5e8 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -495,8 +495,12 @@ Target::Target(TargetKind kind, Optional host, String tag, Array attr_map = TargetKind::GetAttrMap(::tvm::attr::kIsExternalCodegen); - return attr_map.get(get()->kind, Bool(false)); + TargetKindAttrMap is_external_codegen_map = + TargetKind::GetAttrMap(tvm::attr::kIsExternalCodegen); + TargetKindAttrMap relay_to_tir_map = + TargetKind::GetAttrMap(tvm::attr::kRelayToTIR); + return is_external_codegen_map.get(get()->kind, Bool(false)) || + relay_to_tir_map.count(get()->kind); } bool Target::IsExternalCodegenFor(const Target& that) const { diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index b657ac0c5783d..2c85e47e7fb89 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -144,16 +145,21 @@ TVM_REGISTER_TARGET_KIND("test_external_codegen_1", kDLCUDA) TVM_REGISTER_TARGET_KIND("test_external_codegen_2", kDLMetal) .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); +TVM_REGISTER_TARGET_KIND("test_external_codegen_3", kDLCPU) + .set_attr(tvm::attr::kRelayToTIR, tvm::relay::transform::InferType()); + TEST(Target, ExternalCodegen) { Target regular("cuda"); Target external0("test_external_codegen_0"); Target external1("test_external_codegen_1"); Target external2("test_external_codegen_2"); + Target external3("test_external_codegen_3"); ASSERT_FALSE(regular.IsExternalCodegen()); ASSERT_TRUE(external0.IsExternalCodegen()); ASSERT_TRUE(external1.IsExternalCodegen()); ASSERT_TRUE(external2.IsExternalCodegen()); + ASSERT_TRUE(external3.IsExternalCodegen()); ASSERT_TRUE(external0.IsExternalCodegenFor(regular)); ASSERT_FALSE(regular.IsExternalCodegenFor(external0)); diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 41123a2548256..dbc5147e20300 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -6653,4 +6653,4 @@ def verify_LinearRegressor(a_shape, c_shape, i_shape, targets=1, batch=1): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index a017762ce35db..690ddcac8d512 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -208,6 +208,4 @@ def verify_pad_default_fill(dshape, pad_width, dtype): if __name__ == "__main__": - test_dyn_pad() - test_dyn_upsampling_infer_type_const() - test_dyn_upsampling_run() + tvm.testing.main() diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index c5a9041b15fe4..4f451a125184d 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -31,6 +31,8 @@ set_external_func_attr, parametrize_external_codegen_checks, parametrize_external_json_codegen_checks, + check_graph_executor_result, + check_vm_result, ) @@ -180,6 +182,58 @@ def test_extern_gcc(check_result): check_result(mod, inputs, (2, 2), (y_data 
* y_data) - (x_data + x_data)) +# TODO(mbs): The check_aot_executor_result does not support the list-of-targets, mostly because +# tvm.testing.aot.compile_and_run requires the target to be a kind name string, and +# tvm.testing.aot.compile_models requires a single Target object. However, code outside of +# tvm.testing.aot is ready for this more general form. +@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result]) +def test_extern_gcc_with_target_instance(check_result): + shape = (8, 8) + dtype = "int32" + + def make_mod(): + x0 = relay.var("x0", shape=shape, dtype=dtype) + y0 = relay.var("y0", shape=shape, dtype=dtype) + z = x0 + y0 + f = relay.Function([x0, y0], z) + f = set_external_func_attr(f, "ccompiler", "ccompiler_0") + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.var("y", shape=shape, dtype=dtype) + call = relay.Call(f, [x, y]) + return tvm.IRModule.from_expr(call) + + host_target = tvm.target.Target("llvm") + generic_target = tvm.target.Target("llvm", host=host_target) + # The header attribute is just whitespace, so compilation is as usual. + good_extern_codegen_target = tvm.target.Target( + {"kind": "ccompiler", "header": "// Good"}, host=host_target + ) + # The header attribute is ill-formed, so compilation is expected to fail. + bogus_extern_codegen_target = tvm.target.Target( + {"kind": "ccompiler", "header": "Bogus"}, host=host_target + ) + + mod = make_mod() + + x_data = np.random.rand(*shape).astype(dtype) + y_data = np.random.rand(*shape).astype(dtype) + expected_result = x_data + y_data + inputs = {"x": x_data, "y": y_data} + + check_result( + mod, inputs, shape, expected_result, target=[generic_target, good_extern_codegen_target] + ) + + with pytest.raises(RuntimeError): + check_result( + mod, + inputs, + shape, + expected_result, + target=[generic_target, bogus_extern_codegen_target], + ) + + @pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") def test_extern_gcc_consts(): @tvm._ffi.register_func("relay.ext.ccompiler.constant_updater") diff --git a/tests/python/relay/test_target_hooks.py b/tests/python/relay/test_target_hooks.py index 22b3b8cb30638..046b2c7e541de 100644 --- a/tests/python/relay/test_target_hooks.py +++ b/tests/python/relay/test_target_hooks.py @@ -18,19 +18,25 @@ import sys import numpy as np import pytest +import logging +import tvm import tvm.testing from tvm import relay, IRModule from utils.external_codegen import ( + parametrize_external_codegen_checks, set_external_func_attr, check_aot_executor_result, check_graph_executor_result, + check_vm_result, ) +logging.basicConfig(level=logging.INFO) -@pytest.mark.parametrize("check_result", [check_aot_executor_result, check_graph_executor_result]) -def test_tir_external_generation(check_result): + +@parametrize_external_codegen_checks +def test_tir_external_generation_inline_without_target_instance(check_result): shape = (8,) x_data = np.random.randint(255, size=shape).astype("float32") y_data = np.random.randint(255, size=shape).astype("float32") @@ -50,6 +56,49 @@ def test_tir_external_generation(check_result): check_result(func, inputs, (8,), x_data - y_data) +# TODO(mbs): The check_aot_executor_result does not support list-of-targets, mostly because +# tvm.testing.aot.compile_and_run requires the target to be a kind name string, and +# tvm.testing.aot.compile_models requires a single Target object. However, code outside of +# tvm.testing.aot is ready for this more general form. 
+@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result]) +def test_tir_external_generation_outline_with_target_instance(check_result): + shape = (8,) + x_data = np.random.randint(255, size=shape).astype("float32") + y_data = np.random.randint(255, size=shape).astype("float32") + inputs = {"x": x_data, "y": y_data} + # Compile with an instance of the hooked target kind to demonstrate plumbing target attributes + # into custom passes. + host_target = tvm.target.Target("llvm") + generic_target = tvm.target.Target("llvm", host=host_target) + extern_codegen_target = tvm.target.Target( + "example_target_hook -example_attribute=42", host=host_target + ) + mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(8), float32], %y: Tensor[(8), float32]) -> Tensor[(8), float32] { + @replace_add_with_subtract(%x, %y) * 2.0f + } + + def @replace_add_with_subtract(%x: Tensor[(8), float32], %y: Tensor[(8), float32], + Inline=1, + Primitive=1, + Compiler="example_target_hook", + global_symbol="replace_add_with_subtract") -> Tensor[(8), float32] { + %x + %y // will be rewritten to TIR implementing %x - %y - 42.0f by custom pass + } + """ + ) + + check_result( + mod, + inputs, + (8,), + (x_data - y_data - 42.0) * 2.0, + target=[generic_target, extern_codegen_target], + ) + + @pytest.mark.parametrize("check_result", [check_aot_executor_result, check_graph_executor_result]) def test_runtime_module_generation(check_result): shape = (8,) diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py index 6d3d917ff5a23..8e5ab803de7a6 100644 --- a/tests/python/relay/utils/external_codegen.py +++ b/tests/python/relay/utils/external_codegen.py @@ -22,7 +22,7 @@ import pytest import tvm -from tvm import relay, runtime +from tvm import relay, runtime, testing from tvm.contrib import utils From 24b93f56fdbb723cc0f631ce4da0e27d7fb212b1 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 1 Jun 2022 22:59:30 +0300 Subject: [PATCH 010/181] [VM] check DLManagedTensor for conditions to construct NDArray (#11504) * check DLManagedTensor for contiguous and alignment to construct correct NDArray * correction from the reviewer * update error description for incontiguous DLTensors * small update Co-authored-by: Valery Chernov --- src/runtime/ndarray.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 0b4a9dfdd9e91..c7bfefa9a8e73 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -206,8 +206,7 @@ NDArray NDArray::Empty(ShapeTuple shape, DLDataType dtype, Device dev, Optional< } NDArray NDArray::FromExternalDLTensor(const DLTensor& dl_tensor) { - ICHECK(::tvm::runtime::IsContiguous(dl_tensor)) - << "External DLTensor is not contiguous. It does not support for now"; + ICHECK(::tvm::runtime::IsContiguous(dl_tensor)) << "External DLTensor must be contiguous."; ICHECK(IsAligned(dl_tensor)) << "Data in DLTensor is not aligned as required by NDArray"; NDArray::Container* data = new NDArray::Container(); @@ -224,7 +223,7 @@ NDArray NDArray::FromExternalDLTensor(const DLTensor& dl_tensor) { NDArray NDArray::NewFromDLTensor(DLTensor* tensor, const Device& dev) { ICHECK(::tvm::runtime::IsContiguous(*tensor)) - << "DLTensor is not contiguous. It does not support for now"; + << "DLTensor is not contiguous. 
Copying from non-contiguous data is currently not supported"; std::vector shape; for (int64_t i = 0; i < tensor->ndim; i++) { shape.push_back(tensor->shape[i]); @@ -240,6 +239,9 @@ NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { data->SetDeleter(Internal::DLPackDeleter); // fill up content. data->manager_ctx = tensor; + ICHECK(::tvm::runtime::IsContiguous(tensor->dl_tensor)) << "DLManagedTensor must be contiguous."; + ICHECK(IsAligned(tensor->dl_tensor)) + << "Data in DLManagedTensor is not aligned as required by NDArray"; data->dl_tensor = tensor->dl_tensor; // update shape_ std::vector shape; From b9890dbbebeff95202a7dc65cbce3e808869cd33 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:05:30 -0700 Subject: [PATCH 011/181] [skip ci][ci][docs] Add CI infra docs (#11403) * [skip ci][ci][docs] Add CI infra docs This adds some documentation around CI infra and pointers to the guides to run a deploy. * Address comments Co-authored-by: driazati --- docs/contribute/ci.rst | 108 ---------------------- jenkins/README.md | 203 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 201 insertions(+), 110 deletions(-) diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index d40e4d5ab74b2..0cc1bf9dd992b 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -63,114 +63,6 @@ Reproduce Failures Most TVM Python tests run under |pytest|_ and can be run as described in :ref:`pr-testing`. -Keeping CI Green -**************** - -Developers rely on the TVM CI to get signal on their PRs before merging. -Occasionally breakages slip through and break ``main``, which in turn causes -the same error to show up on an PR that is based on the broken commit(s). Broken -commits can be identified `through GitHub `_ -via the commit status icon or via `Jenkins `_. -In these situations it is possible to either revert the offending commit or -submit a forward fix to address the issue. It is up to the committer and commit -author which option to choose, keeping in mind that a broken CI affects all TVM -developers and should be fixed as soon as possible. - -Skip CI for Reverts -------------------- - -For reverts and trivial forward fixes, adding ``[skip ci]`` to the revert's -PR title will cause CI to shortcut and only run lint. Committers should -take care that they only merge CI-skipped PRs to fix a failure on ``main`` and -not in cases where the submitter wants to shortcut CI to merge a change faster. -The PR title is checked when the build is first run (specifically during the lint -step, so changes after that has run do not affect CI and will require the job to -be re-triggered by another ``git push``). - -.. code:: bash - - # Revert HEAD commit, make sure to insert '[skip ci]' at the beginning of - # the commit subject - git revert HEAD - git checkout -b my_fix - # After you have pushed your branch, create a PR as usual. - git push my_repo - # Example: Skip CI on a branch with an existing PR - # Adding this commit to an existing branch will cause a new CI run where - # Jenkins is skipped - git commit --allow-empty --message "[skip ci] Trigger skipped CI" - git push my_repo - -Handling Flaky Failures -*********************** - -.. https://stackoverflow.com/questions/4743845/format-text-in-a-link-in-restructuredtext/4836544#4836544 -.. |pytest's @xfail decorator| replace:: pytest's ``@xfail`` decorator -.. _pytest's @xfail decorator: https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail -.. 
|strict=True| replace:: ``strict=True`` -.. _strict=True: https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter - -If you notice a failure on your PR that seems unrelated to your change, you should -search `recent GitHub issues related to flaky tests `_ and -`file a new issue `_ -if you don't see any reports of the failure. If a certain test or class of tests affects -several PRs or commits on ``main`` with flaky failures, the test should be disabled via -|pytest's @xfail decorator|_ with |strict=True|_ and the relevant issue linked in the -disabling PR. - -.. code:: python - - @pytest.mark.xfail(strict=False, reason="Flaky test: https://github.com/apache/tvm/issues/1234") - def test_something_flaky(): - pass - -``ci-docker-staging`` -********************* - -The `ci-docker-staging `_ -branch is used to test updates to Docker images and ``Jenkinsfile`` changes. When -running a build for a normal PR from a forked repository, Jenkins uses the code -from the PR except for the ``Jenkinsfile`` itself, which comes from the base branch. -When branches are built, the ``Jenkinsfile`` in the branch is used, so a committer -with write access must push PRs to a branch in apache/tvm to properly test -``Jenkinsfile`` changes. If your PR makes changes to the ``Jenkinsfile``, make sure -to @ a `committer `_ -and ask them to push your PR as a branch to test the changes. - -.. _docker_images: - -Docker Images -************* - -.. |top_of_the_Jenkinsfile| replace:: top of the ``Jenkinsfile`` -.. _top_of_the_Jenkinsfile: https://github.com/apache/tvm/blob/7481a297740f073b193a3f09b3e27f056e8c7f2e/Jenkinsfile#L48-L54 - -Each CI job runs most of its work inside a Docker container, built from files -in the `docker/ `_ folder. These -files are built nightly in Jenkins via the `docker-images-ci `_ job. -The images for these containers are hosted in the `tlcpack Docker Hub `_ -and referenced at the |top_of_the_Jenkinsfile|_. These can be inspected and run -locally via standard Docker commands. - -.. code:: bash - - # Beware: CI images can be several GB in size - # Get a bare docker shell in the ci-gpu container - docker run -it tlcpack/ci-gpu:v0.78 /bin/bash - -``docker/bash.sh`` will automatically grab the latest image from the ``Jenkinsfile`` -and help in mounting your current directory. - -.. code:: bash - - # Run the ci_cpu image specified in Jenkinsfile - cd tvm - bash docker/bash.sh ci_cpu - # the tvm directory is automatically mounted - # example: build tvm (note: this will overrwrite build/) - $ ./tests/scripts/task_config_build_cpu.sh - $ ./tests/scripts/task_build.sh build -j32 - Reporting Issues **************** diff --git a/jenkins/README.md b/jenkins/README.md index 454664b40c643..f2f695f9fc5da 100644 --- a/jenkins/README.md +++ b/jenkins/README.md @@ -15,14 +15,213 @@ +# TVM CI + +TVM runs CI jobs on every commit to an open pull request and to branches in the apache/tvm repo (such as `main`). These jobs are essential to keeping the TVM project in a healthy state and preventing breakages. Jenkins does most of the work in running the TVM tests, though some smaller jobs are also run on GitHub Actions. + +## GitHub Actions + +GitHub Actions is used to run Windows jobs, MacOS jobs, and various on-GitHub automations. These are defined in [`.github/workflows`](../.github/workflows/). 
These automations include bots to: +* [cc people based on subscribed teams/topics](https://github.com/apache/tvm/issues/10317) +* [allow non-committers to merge approved / CI passing PRs](https://discuss.tvm.apache.org/t/rfc-allow-merging-via-pr-comments/12220) +* [add cc-ed people as reviewers on GitHub](https://discuss.tvm.apache.org/t/rfc-remove-codeowners/12095) +* [ping languishing PRs after no activity for a week (currently opt-in only)](https://github.com/apache/tvm/issues/9983) +* [push a `last-successful` branch to GitHub with the last `main` commit that passed CI](https://github.com/apache/tvm/tree/last-successful) + +https://github.com/apache/tvm/actions has the logs for each of these workflows. Note that when debugging these workflows changes from PRs from forked repositories won't be reflected in the PR. These should be tested in the forked repository first and linked in the PR body. + + +## Keeping CI Green + +Developers rely on the TVM CI to get signal on their PRs before merging. +Occasionally breakages slip through and break `main`, which in turn causes +the same error to show up on an PR that is based on the broken commit(s). Broken +commits can be identified [through GitHub](https://github.com/apache/tvm/commits/main>) +via the commit status icon or via [Jenkins](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>). +In these situations it is possible to either revert the offending commit or +submit a forward fix to address the issue. It is up to the committer and commit +author which option to choose, keeping in mind that a broken CI affects all TVM +developers and should be fixed as soon as possible. + +Some tests are also flaky and fail for reasons unrelated to the PR. The [CI monitoring rotation](https://github.com/apache/tvm/wiki/CI-Monitoring-Runbook) watches for these failures and disables tests as necessary. It is the responsibility of those who wrote the test to ultimately fix and re-enable the test. + + +## Dealing with Flakiness + +If you notice a failure on your PR that seems unrelated to your change, you should +search [recent GitHub issues related to flaky tests](https://github.com/apache/tvm/issues?q=is%3Aissue+%5BCI+Problem%5D+Flaky+>) and +[file a new issue](https://github.com/apache/tvm/issues/new?assignees=&labels=&template=ci-problem.md&title=%5BCI+Problem%5D+>) +if you don't see any reports of the failure. If a certain test or class of tests affects +several PRs or commits on `main` with flaky failures, the test should be disabled via +[pytest's @xfail decorator](https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail) with [`strict=False`](https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter) and the relevant issue linked in the +disabling PR. + +```python +@pytest.mark.xfail(strict=False, reason="Flaky test: https://github.com/apache/tvm/issues/1234") + def test_something_flaky(): + pass +``` + +Then submit a PR as usual + +```bash +git add +git commit -m'[skip ci][ci] Disable flaky test: `` + +See # +' +gh pr create +``` + +## Skipping CI + +For reverts and trivial forward fixes, adding `[skip ci]` to the revert's +PR title will cause CI to shortcut and only run lint. Committers should +take care that they only merge CI-skipped PRs to fix a failure on `main` and +not in cases where the submitter wants to shortcut CI to merge a change faster. 
+The PR title is checked when the build is first run (specifically during the lint +step, so changes after that has run do not affect CI and will require the job to +be re-triggered by another `git push`). + +```bash +# Revert HEAD commit, make sure to insert '[skip ci]' at the beginning of +# the commit subject +git revert HEAD +git checkout -b my_fix +# After you have pushed your branch, create a PR as usual. +git push my_repo +# Example: Skip CI on a branch with an existing PR +# Adding this commit to an existing branch will cause a new CI run where +# Jenkins is skipped +git commit --allow-empty --message "[skip ci] Trigger skipped CI" +git push my_repo +``` + +## Docker Images + +Each CI job runs most of its work inside a Docker container, built from files +in the [`docker/`](../docker) folder. These +files are built nightly in Jenkins via the [docker-images-ci](https://ci.tlcpack.ai/job/docker-images-ci/>) job. +The images for these containers are hosted in the [tlcpack Docker Hub](https://hub.docker.com/u/tlcpack>) +and referenced in the [`Jenkinsfile.j2`](Jenkinsfile.j2). These can be inspected and run +locally via standard Docker commands. + +### `ci-docker-staging` + +The [ci-docker-staging](https://github.com/apache/tvm/tree/ci-docker-staging>) +branch is used to test updates to Docker images and `Jenkinsfile` changes. When +running a build for a normal PR from a forked repository, Jenkins uses the code +from the PR except for the `Jenkinsfile` itself, which comes from the base branch. +When branches are built, the `Jenkinsfile` in the branch is used, so a committer +with write access must push PRs to a branch in apache/tvm to properly test +`Jenkinsfile` changes. If your PR makes changes to the `Jenkinsfile`, make sure +to @ a [committer](../CONTRIBUTORS.md>) +and ask them to push your PR as a branch to test the changes. + # Jenkins CI +TVM uses Jenkins for running Linux continuous integration (CI) tests on +[branches](https://ci.tlcpack.ai/job/tvm/) and +[pull requests](https://ci.tlcpack.ai/job/tvm/view/change-requests/) through a +build configuration specified in a [`Jenkinsfile`](../Jenkinsfile). +Other jobs run in GitHub Actions for Windows and MacOS jobs. + +## `Jenkinsfile` + The template files in this directory are used to generate the [`Jenkinsfile`](../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches. To regenerate the `Jenkinsfile`, run ```bash -pip install -r jenkins/requirements.txt -python jenkins/generate.py +python3 -mvenv _venv +_venv/bin/pip3 install -r jenkins/requirements.txt +_venv/bin/python3 jenkins/generate.py ``` +# Infrastructure + +Jenkins runs in AWS on an EC2 instance fronted by an ELB which makes it available at https://ci.tlcpack.ai. These definitions are declared via Terraform in the [tlc-pack/ci-terraform](https://github.com/tlc-pack/ci-terraform) repository. The Terraform code references custom AMIs built in [tlc-pack/ci-packer](https://github.com/tlc-pack/ci-packer). [tlc-pack/ci](https://github.com/tlc-pack/ci) contains Ansible scripts to deploy the Jenkins head node and set it up to interact with AWS. + +The Jenkins head node has a number of autoscaling groups with labels that are used to run jobs (e.g. `CPU`, `GPU` or `ARM`) via the [EC2 Fleet](https://plugins.jenkins.io/ec2-fleet/) plugin. + +## Deploying + +Deploying Jenkins can disrupt developers so it must be done with care. Jobs that are in-flight will be cancelled and must be manually restarted. 
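The Docker Images section above no longer carries the concrete commands that the deleted `docs/contribute/ci.rst` included. For readers who want them close at hand, a usage sketch — the `v0.78` tag is only an example, so check the current `Jenkinsfile` for the image actually in use:

```bash
# Beware: CI images can be several GB in size.
# Get a bare shell in the ci-gpu container (tag shown is illustrative).
docker run -it tlcpack/ci-gpu:v0.78 /bin/bash

# docker/bash.sh picks up the image referenced in the Jenkinsfile and mounts the repo.
cd tvm
bash docker/bash.sh ci_cpu
# Inside the container, e.g. configure and build (note: this overwrites build/).
./tests/scripts/task_config_build_cpu.sh
./tests/scripts/task_build.sh build -j32
```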
Follow the instructions [here](https://github.com/tlc-pack/ci/issues/10) to run a deploy. + +## Monitoring + +Dashboards of CI data can be found: +* within Jenkins at https://ci.tlcpack.ai/monitoring (HTTP / JVM stats) +* at https://monitoring.tlcpack.ai (job status, worker status) + +## CI Diagram + +This details the individual parts that interact in TVM's CI. For details on operations, see https://github.com/tlc-pack/ci. + +```mermaid +graph TD + Commit --> GitHub + GitHub --> |`push` webhook| WebhookServer(Webhook Server) + JobExecutor(Job Executor) + WebhookServer --> JobExecutor + JobExecutor --> EC2Fleet(EC2 Fleet Plugin) + EC2Fleet --> |capacity request| EC2(EC2 Autoscaler) + JobExecutor --> WorkerEC2Instance + Docker --> |build cache, artifacts| S3 + WorkerEC2Instance --> Docker + Docker --> |docker pull| G(Docker Hub) + Docker --> |docker push / pull| ECR + Docker --> |Execute jobs| CIScripts(CI Scripts) + RepoCITerraform(ci-terraform repo) --> |terraform| ECR + RepoCITerraform(ci-terraform repo) --> |terraform| EC2 + RepoCITerraform(ci-terraform repo) --> |terraform| S3 + RepoCI(ci repo) --> |configuration via Ansible| WorkerEC2Instance + RepoCIPacker(ci-packer) --> |AMIs| EC2 + Monitoring_Scrapers(Jenkins Scraper) --> Monitoring_DB(Postrgres) + Grafana --> Monitoring_DB + GitHub --> Windows + GitHub --> MacOS + + Developers --> |check PR status|JenkinsUI(Jenkins Web UI) + Monitoring_Scrapers --> |fetch job data| JenkinsUI + Developers --> |git push| Commit + Developers --> |create PR| GitHub + + subgraph Jenkins Head Node + WebhookServer + JobExecutor + EC2Fleet + JenkinsUI + end + + subgraph GitHub Actions + Windows + MacOS + end + + subgraph Configuration / Terraform + RepoCITerraform + RepoCI + RepoCIPacker + end + + subgraph Monitoring + Monitoring_DB + Grafana + Monitoring_Scrapers + end + + subgraph AWS + subgraph Jenkins Workers + WorkerEC2Instance(Worker EC2 Instance) + subgraph "Worker EC2 Instance" + Docker + CIScripts + end + end + EC2 + ECR + S3 + end + +``` From a1d95ec1ea30ac70e544a3cf10c839e228d407bf Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:07:36 -0700 Subject: [PATCH 012/181] [ci] Add conditionals for non-Python tests (#11438) These don't get sharded in any way so there's no point in running them multiple times. cc Mousius areusch --- Jenkinsfile | 7 +------ jenkins/Test.groovy.j2 | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 44389ba767dc7..b9175f06afdc5 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-27T14:45:11.226042 +// Generated at 2022-05-31T16:54:56.997402 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -1268,7 +1268,6 @@ def shard_run_python_i386_1_of_5() { script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - fsim_test(ci_i386) }) } } finally { @@ -1360,7 +1359,6 @@ def shard_run_python_i386_3_of_5() { script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - fsim_test(ci_i386) }) } } finally { @@ -1406,7 +1404,6 @@ def shard_run_python_i386_4_of_5() { script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - fsim_test(ci_i386) }) } } finally { @@ -1452,7 +1449,6 @@ def shard_run_python_i386_5_of_5() { script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - fsim_test(ci_i386) }) } } finally { @@ -2476,7 +2472,6 @@ def shard_run_topi_aarch64_2_of_2() { ) ci_setup(ci_arm) - cpp_unittest(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", label: 'Run test_arm_compute_lib test', diff --git a/jenkins/Test.groovy.j2 b/jenkins/Test.groovy.j2 index 9f949ae717c2a..d86575c247c75 100644 --- a/jenkins/Test.groovy.j2 +++ b/jenkins/Test.groovy.j2 @@ -74,7 +74,9 @@ script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) + {% if shard_index == 2 or num_shards < 2 %} fsim_test(ci_i386) + {% endif %} {% endcall %} {% call(shard_index, num_shards) m.sharded_test_step( name="test: Hexagon", @@ -156,7 +158,9 @@ ) %} {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} ci_setup(ci_arm) + {% if shard_index == 1 %} cpp_unittest(ci_arm) + {% endif %} sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", label: 'Run test_arm_compute_lib test', From e84f163f573c07bb9f41209b8f722c76a92ae65d Mon Sep 17 00:00:00 2001 From: Sergey <88086617+shtinsa@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:13:41 +0300 Subject: [PATCH 013/181] [TE] Optimized version of concatenation layer (#11341) * [TE] Optimized version of concatenation layer 1. Concat implemented using extern_op 2. New tests added. 3. Workaround to allow inline extern_op-s with other layers. * *test fix * test_any.py fix. * test_forward.py from tensorflow fix. * lint fix. * Fixes after code review. * New comment added. * Lint fix. * Another lint fix. * Comments added. * rebase issue fix. * Restored previous state. * Update after code review. * After code review changes. * lint review. * Change strategy for cuda to fix tests. * Rebase to main * Comments changes after review. * Some more comments fixes. * One more error fix in comments. 
* restart build --- python/tvm/relay/op/_transform.py | 7 +- python/tvm/relay/op/strategy/cuda.py | 14 ++- python/tvm/relay/op/strategy/generic.py | 21 ++++ python/tvm/relay/op/strategy/x86.py | 40 +++++-- python/tvm/topi/x86/__init__.py | 1 + python/tvm/topi/x86/concat.py | 109 ++++++++++++++++++ python/tvm/topi/x86/injective.py | 42 ++++++- src/relay/op/tensor/transform.cc | 1 - src/te/schedule/schedule_dataflow_rewrite.cc | 30 ++++- tests/python/relay/test_op_level1.py | 97 ++++++++++++++++ .../test_micro_model_library_format.py | 27 +++-- 11 files changed, 359 insertions(+), 30 deletions(-) create mode 100644 python/tvm/topi/x86/concat.py diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 0338035329fcf..d87ee266f01df 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -68,7 +68,12 @@ # concatenate -_reg.register_schedule("concatenate", strategy.schedule_concatenate) +@_reg.register_compute("concatenate") +def compute_concat(attrs, inputs, output_type): + return [topi.concatenate(inputs, attrs.axis)] + + +_reg.register_strategy("concatenate", strategy.concatenate_strategy) # sliding_window @_reg.register_compute("sliding_window") diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 59971d4e206f5..4a7cff5f3f33c 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -42,11 +42,15 @@ def schedule_reduce_cuda(attrs, outs, target): return topi.cuda.schedule_reduce(outs) -@schedule_concatenate.register(["cuda", "gpu"]) -def schedule_concatenate_cuda(attrs, outs, target): - """schedule concatenate for cuda""" - with target: - return topi.cuda.schedule_injective(outs) +@concatenate_strategy.register(["cuda", "gpu"]) +def concatenate_strategy_cuda(attrs, inputs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_concat(topi.transform.concatenate), + wrap_topi_schedule(topi.cuda.schedule_injective), + name="concatenate.cuda", + ) + return strategy @schedule_pool.register(["cuda", "gpu"]) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index fa62af5f9fed2..2bb009dbc8f71 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1781,6 +1781,15 @@ def _compute_scanop(attrs, inputs, _): return _compute_scanop +def wrap_compute_concat(topi_compute): + """Wrap concatenate topi compute""" + + def _compute_concat(attrs, inputs, _): + return [topi_compute(inputs, attrs.axis)] + + return _compute_concat + + @override_native_generic_func("cumsum_strategy") def cumsum_strategy(attrs, inputs, out_type, target): """cumsum generic strategy""" @@ -1793,6 +1802,18 @@ def cumsum_strategy(attrs, inputs, out_type, target): return strategy +@override_native_generic_func("concat_strategy") +def concatenate_strategy(attrs, inputs, out_type, target): + """concatenate generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_concat(topi.concatenate), + wrap_topi_schedule(topi.generic.schedule_injective), + name="concatenate", + ) + return strategy + + @override_native_generic_func("cumprod_strategy") def cumprod_strategy(attrs, inputs, out_type, target): """cumprod generic strategy""" diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 0beb99e4f7dbf..59a57fd233f56 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py 
@@ -19,7 +19,7 @@ import logging import re -from tvm import topi +from tvm import topi, tir from tvm.topi.x86.utils import target_has_vnni from tvm.auto_scheduler import is_auto_scheduler_enabled from tvm.te import SpecializedCondition @@ -48,13 +48,6 @@ def schedule_reduce_cpu(attrs, outs, target): return topi.x86.schedule_reduce(outs) -@schedule_concatenate.register("cpu") -def schedule_concatenate_cpu(attrs, outs, target): - """schedule concatenate op for x86""" - with target: - return topi.x86.schedule_concatenate(outs) - - @schedule_pool.register("cpu") def schedule_pool_cpu(attrs, outs, target): """schedule pooling ops for x86""" @@ -741,3 +734,34 @@ def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_typ "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) ) return strategy + + +@concatenate_strategy.register(["cpu"]) +def concatenate_strategy_cpu(attrs, inputs, out_type, target): + """concatenate x86 strategy""" + strategy = _op.OpStrategy() + use_only_old_concat = False + for inpt in inputs: + shape = inpt.shape + for i in shape: + if not isinstance(i, tir.expr.IntImm): + use_only_old_concat = True + break + if use_only_old_concat: + strategy.add_implementation( + wrap_compute_concat(topi.transform.concatenate), + wrap_topi_schedule(topi.x86.injective.schedule_concatenate), + name="concatenate.generic", + ) + else: + strategy.add_implementation( + wrap_compute_concat(topi.x86.concatenate), + wrap_topi_schedule(topi.x86.schedule_concatenate_cpu), + name="concatenate.cpu", + ) + strategy.add_implementation( + wrap_compute_concat(topi.transform.concatenate), + wrap_topi_schedule(topi.x86.injective.schedule_concatenate), + name="concatenate.generic", + ) + return strategy diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index 34a5e0362d871..d075090f01eac 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -43,3 +43,4 @@ from .scatter import * from .group_conv2d import * from .math_alter_op import * +from .concat import * diff --git a/python/tvm/topi/x86/concat.py b/python/tvm/topi/x86/concat.py new file mode 100644 index 0000000000000..5cb3cd3f57d50 --- /dev/null +++ b/python/tvm/topi/x86/concat.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"concatenate related operators" +from typing import Optional +import tvm +from tvm import te +import numpy as np +from ..utils import get_const_int, const_vector + + +def concatenate(data: tvm.te.Tensor, axis: Optional[int] = 0): + """Join a sequence of arrays along an existing axis. Optimized for CPU exeution. + + Parameters + ---------- + data : tuple of tvm.te.Tensor + The arrays to concatenate + + axis : int, optional + The axis along which the arrays will be joined. 
Default is 0. + + Returns + ------- + ret : tvm.te.Tensor + """ + + def gen_ir_1d(data_bufs, in_outers_tensor, in_cumsum_tensor, out_buf): + """Custom conactenation execution.""" + i_b = tvm.tir.ir_builder.create() + data_bufs1 = [i_b.buffer_ptr(data_buf) for data_buf in data_bufs] + out_buf = i_b.buffer_ptr(out_buf) + outers = i_b.buffer_ptr(in_outers_tensor) + cumsum = i_b.buffer_ptr(in_cumsum_tensor) + for i in range(len(data)): + with i_b.for_range(0, outers[i], name="j") as j: + out_buf[cumsum[i] + j] = data_bufs1[i][j] + return i_b.get() + + def gen_ir(data_bufs, in_outers_tensor, in_cumsum_tensor, out_buf, inner, outer): + """Common case of conactenation execution.""" + i_b = tvm.tir.ir_builder.create() + data_bufs1 = [i_b.buffer_ptr(data_buf) for data_buf in data_bufs] + out_buf = i_b.buffer_ptr(out_buf) + outers = i_b.buffer_ptr(in_outers_tensor) + cumsum = i_b.buffer_ptr(in_cumsum_tensor) + if inner > 1: + with i_b.for_range(0, inner, name="inn", kind="parallel") as inn: + pos = inn * outer + for i in range(len(data)): + offset = inn * outers[i] + with i_b.for_range(0, outers[i], name="j") as j: + out_buf[pos + cumsum[i] + j] = data_bufs1[i][offset + j] + else: + for i in range(len(data)): + with i_b.for_range(0, outers[i], name="j", kind="parallel") as j: + out_buf[cumsum[i] + j] = data_bufs1[i][j] + return i_b.get() + + if axis < 0: + axis += len(data[0].shape) + concat_axis_sizes = [int(t.shape[axis]) for t in data] + join_size = int(np.sum(concat_axis_sizes)) + in_outers = [int(np.prod(i.shape[axis:])) for i in data] + in_outers_cumsum = [0, *np.cumsum(in_outers, dtype="int64")[0:-1]] + dtype = data[0].dtype + out_shape = data[0].shape[:axis] + [join_size] + data[0].shape[axis + 1 :] + in_outers_tensor = const_vector(in_outers) + in_cumsum_tensor = const_vector(in_outers_cumsum, name="cumsum") + right_val = np.prod(out_shape[axis:]) + left_val = np.prod(out_shape[:axis]) + + if ( + len(data[0].shape) == 1 + or right_val == 1 + or (left_val == 1 and axis == len(data[0].shape) - 1) + or (left_val == 1 and right_val == 1) + ): + # badly parallelized case + return te.extern( + [out_shape], + list(data) + [in_outers_tensor, in_cumsum_tensor], + lambda ins, outs: gen_ir_1d(ins, ins[-2], ins[-1], outs[0]), + dtype=dtype, + name="concatenate_ext", + ) + + inner = get_const_int(int(left_val)) + outer = get_const_int(int(right_val)) + return te.extern( + [out_shape], + list(data) + [in_outers_tensor, in_cumsum_tensor], + lambda ins, outs: gen_ir(ins, ins[-2], ins[-1], outs[0], inner, outer), + dtype=dtype, + name="concatenate_ext", + ) diff --git a/python/tvm/topi/x86/injective.py b/python/tvm/topi/x86/injective.py index 6492b78d6037a..78893397ba31d 100644 --- a/python/tvm/topi/x86/injective.py +++ b/python/tvm/topi/x86/injective.py @@ -17,20 +17,22 @@ # pylint: disable=invalid-name """x86 declaration and schedules.""" from tvm import te +from tvm.topi import tag from tvm.tir import IntImm +from tvm.topi.generic.injective import ( + schedule_injective_from_existing as schedule_injective_for_concat, +) from ..utils import is_empty_shape def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. - Parameters ---------- sch: Schedule The schedule to update. out: Tensor The tensor representing the injective op. - Returns ------- sch: Schedule @@ -61,13 +63,11 @@ def schedule_injective_from_existing(sch, out): def schedule_injective(outs): """X86 schedule for injective op. 
- Parameters ---------- outs: Array of Tensor The computation graph description of injective in the format of an array of tensors. - Returns ------- sch: Schedule @@ -85,13 +85,11 @@ def schedule_injective(outs): def schedule_concatenate(outs): """X86 schedule for concatenate op. - Parameters ---------- outs: Array of Tensor The computation graph description of injective in the format of an array of tensors. - Returns ------- sch: Schedule @@ -132,5 +130,37 @@ def vectorize(sch, tensor, vectorize_limit): return s +def schedule_concatenate_cpu(outs): + """X86 schedule for concatenate op. + Parameters + ---------- + outs: Array of Tensor + The computation graph description in the format + of an array of tensors. + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + if tag.is_injective(op.tag): + schedule_injective_for_concat(s, op.output(0)) + + for tensor in op.input_tensors: + if tensor.op.input_tensors and tensor.op not in scheduled_ops: + traverse(tensor.op) + scheduled_ops.append(op) + + for out in outs: + traverse(out.op) + + return s + + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index e888eccc2b1c7..57bf9f36def93 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -346,7 +346,6 @@ RELAY_REGISTER_OP("concatenate") .set_support_level(1) .add_type_rel("Concatenate", ConcatenateRel) .set_attr("FInferCorrectLayout", ConcatenateLayout) - .set_attr("FTVMCompute", ConcatenateCompute) .set_attr("TOpPattern", kInjective); TVM_REGISTER_NODE_TYPE(StackAttrs); diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index 2b30055c4f424..a8363fd084cd2 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -511,6 +511,29 @@ void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) { std::vector changed(sch->stages.size(), false); std::vector new_hybrid_body(sch->stages.size()); std::vector hybrid_changed(sch->stages.size(), false); + // (sshtin): this workaround allows to inline extern ops into their consumer. + // All inputs for extern op should not be inlined because inlining may happen + // before TE generation for particular extern op. That may lead to + // crash during lowering or building stages. + // The problem description: + // In case of operations fusing, arguments inlining + // prevents creation of ProducerNode for extern operation. + // Instead of the creation it is supposed to use operation argument as inlined buffer + // but extern_op TIR generation can be peformed after inlining procedure so + // newly generated TIR does not have reference to input data at all. 
+ std::unordered_map ext_ops; + for (size_t i = 0; i < sch->stages.size(); i++) { + Stage stage = sch->stages[i]; + auto ext_op = stage->op.as(); + if (ext_op) { + auto inps = ext_op->InputTensors(); + for (size_t ii = 0; ii < inps.size(); ++ii) { + if (ext_ops.find(inps[ii]->op) == ext_ops.end()) { + ext_ops[inps[ii]->op] = stage->op; + } + } + } + } // inline all the ops for (size_t i = sch->stages.size(); i != 0; --i) { Stage stage = sch->stages[i - 1]; @@ -525,8 +548,13 @@ void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) { for (auto iv : compute->axis) { args.push_back(iv->var); } + if (ext_ops.find(stage->op) != ext_ops.end()) { + // sshtin: The extern op can try to get access to the input tensors as a raw data, + // that can lead to error in IR builder. + stage->attach_type = kGroupRoot; + continue; + } ICHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output"; - if (feature_extraction_mode && compute->attrs.count("const_matrix")) { // Use constant value to replace access of const matrices. // This produces wrong IR but is good enough for feature extraction purposes. diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 170850809ad54..f4afc9e90562c 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -431,6 +431,103 @@ def test_batch_norm(): ) +def do_concat_test(shapes, t_shape, dtype, axis, dev, target): + varsToConcat = [] + inputData = [] + pos = 0 + for s in shapes: + varsToConcat.append(relay.var("x{}".format(pos), shape=s)) + inputData.append(np.random.rand(*s).astype(dtype)) + pos += 1 + t = relay.var("z", shape=t_shape, dtype=dtype) + z = relay.concatenate(varsToConcat, axis=axis) + z = relay.add(z, t) + params = varsToConcat + params.append(t) + func = relay.Function(params, z) + t_data = np.random.uniform(low=-10, high=10, size=t_shape).astype(dtype) + ref_res = np.concatenate((tuple(inputData)), axis=axis) + t_data + mod = tvm.IRModule.from_expr(func) + + executor = relay.create_executor("graph", mod=mod, device=dev, target=target) + op_res1 = executor.evaluate()(*inputData, t_data) + + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.000001) + op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + *inputData, t_data + ) + tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.000001) + + +@tvm.testing.parametrize_targets("llvm") +def test_concatenate1(target, dev): + np.random.seed(471) + maxNumDimensions = 6 + shape = [4, 32, 16, 1, 31, 20, 21, 8, 28, 7] # just randomly selected 10 numbers + for dtype in ["float32"]: + for dimsNum in range(1, maxNumDimensions): + np.random.shuffle(shape) + for axis in range(0, dimsNum): # range should be (-dimsNum + 1, dimsNum) + numToConcat = np.random.uniform(low=2, high=10, size=(1)).astype("int64")[0] + shapes = [] + # the code below to normalize axes index. 
For some reasons tvm notifies about error if the axis is negative + normalizedAxis = axis + if axis < 0: + normalizedAxis += dimsNum + finalSize = 0 + for i in range(0, numToConcat): + shp = tuple(shape[:dimsNum]) + finalSize += shape[(i % len(shape))] + shapes.append( + shp[:normalizedAxis] + + tuple([shape[(i % len(shape))]]) + + shp[normalizedAxis + 1 :] + ) + t_shape = shp[:normalizedAxis] + tuple([finalSize]) + shp[normalizedAxis + 1 :] + do_concat_test(shapes, t_shape, dtype, axis, dev, target) + + +@tvm.testing.parametrize_targets("llvm") +def test_concatenate2(target, dev): + # test to cover cases (1, .. , x, 1, .. , 1) + np.random.seed(13) + maxNumDimensions = 6 + shape = [8, 3, 25, 33, 12, 29, 5, 11, 29, 11] # just randomly selected 10 numbers + ind = 0 + for dtype in ["float32"]: + for dimsNum in range(2, maxNumDimensions): + np.random.shuffle(shape) + for axis in range(-dimsNum + 1, dimsNum): # range should be (-dimsNum + 1, dimsNum) + numToConcat = np.random.uniform(low=2, high=10, size=(1)).astype("int64")[0] + shapes = [] + # the code below to normalize axes index. For some reasons tvm notifies about error if the axis is negative + normalizedAxis = axis + if axis < 0: + normalizedAxis += dimsNum + finalSize = 0 + for i in range(0, numToConcat): + axisVal = [1] * dimsNum + axisVal[axis] = shape[(ind % len(shape))] + ind += 1 + finalSize += axisVal[axis] + shapes.append(tuple(axisVal)) + temp = [1] * dimsNum + temp[axis] = finalSize + t_shape = tuple(temp) + do_concat_test(shapes, t_shape, dtype, axis, dev, target) + + +@tvm.testing.parametrize_targets("llvm") +def test_concatenate3(target, dev): + np.random.seed(477) + for dtype in ["float32"]: + axis = -2 + ending = 1 + shapes = [[3, 2, 1, ending], [3, 2, 1, ending]] + t_shape = [3, 2, 2, ending] + do_concat_test(shapes, t_shape, dtype, axis, dev, target) + + def test_batch_norm_fold_const(): axis = 1 dtype = "float32" diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index ad054479fd7b2..d707e6b4646b7 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -22,6 +22,7 @@ import numpy import pytest +import platform import tvm import tvm.relay @@ -418,14 +419,24 @@ def test_export_byoc_c_module(): with tf.extractfile("./metadata.json") as f: metadata = json.load(f) main_md = metadata["memory"]["functions"]["main"] - assert main_md == [ - { - "constants_size_bytes": 0, - "device": 1, - "io_size_bytes": 4800, - "workspace_size_bytes": 800, - } - ] + if platform.architecture()[0] == "64bit": + assert main_md == [ + { + "constants_size_bytes": 0, + "device": 1, + "io_size_bytes": 4800, + "workspace_size_bytes": 1264, + } + ] + else: + assert main_md == [ + { + "constants_size_bytes": 0, + "device": 1, + "io_size_bytes": 4800, + "workspace_size_bytes": 1248, + } + ] if __name__ == "__main__": From a329df40289eeca45163454bc1998a998d151d26 Mon Sep 17 00:00:00 2001 From: Ziheng Jiang Date: Wed, 1 Jun 2022 13:25:05 -0700 Subject: [PATCH 014/181] [COMMUNITY] driazati -> Committer (#11525) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b0ad37c4e545c..cfd99ae73f653 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -62,6 +62,7 @@ We do encourage everyone to work anything they are interested in. 
- [Lily Orth-Smith](https://github.com/electriclilies): @electriclilies - relay - [Krzysztof Parzyszek](https://github.com/kparzysz-quic) (PMC): @kparzysz-quic - hexagon, llvm - [Andrew Reusch](https://github.com/areusch): (PMC) @areusch - runtime, microTVM +- [David Riazati](https://github.com/driazati): @driazati - ci, community - [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay - [Gustavo Romero](https://github.com/gromero): @gromero - microtvm, tvmc - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - aot, arm From ce60bfa0ff014752e879ea5eae7ad87a9d32bc2c Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 1 Jun 2022 15:16:09 -0700 Subject: [PATCH 015/181] [ci] Add filter to teams (#11455) This improves the parsing to avoid issues like in #11454 commit-id:53a06ab3 Co-authored-by: driazati --- tests/python/ci/test_ci.py | 15 +++++++++++++++ tests/scripts/github_tag_teams.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index f5297c7ae7cce..042c109dd9d49 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -511,6 +511,7 @@ def run(type, data, check): """ comment2 = """ something @person4 + @person5 """ teams = { "data": { @@ -731,6 +732,20 @@ def run(type, data, check): check="Dry run, would have updated issues/1234 with {'body': '@person2 @SOME1-ONE-\\n\\ncc @person1'}", ) + run( + type="ISSUE", + data={ + "title": "[] A title", + "number": 1234, + "user": { + "login": "person5", + }, + "labels": [], + "body": "@person2 @SOME1-ONE-", + }, + check="No one to cc, exiting", + ) + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/scripts/github_tag_teams.py b/tests/scripts/github_tag_teams.py index 96c22cf6a5db3..f040c1edc9780 100755 --- a/tests/scripts/github_tag_teams.py +++ b/tests/scripts/github_tag_teams.py @@ -122,7 +122,7 @@ def add_tag(tag, users): for tag in result: result[tag] = list(set(result[tag])) - return {k.lower(): v for k, v in result.items()} + return {k.lower(): v for k, v in result.items() if k.strip()} def tags_from_title(title: str) -> List[str]: From c6d7ecd0b5e71796c79b001f439322ae1d0ddbe0 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 1 Jun 2022 23:57:33 -0700 Subject: [PATCH 016/181] [TE] Fix `te.CreatePrimFunc` for 0-dim computation (#11518) For 0-dimensional computation, `te.CreatePrimFunc` creates an opaque block with 0 block iters, which is mistakenly passed into TVMScript auto-completion that failed to add the root block properly. 
As an example, ```python >> from tvm import te >> a = te.placeholder((), name="a", dtype="int32") >> b = te.placeholder((), name="b", dtype="int32") >> c = te.compute(a.shape, lambda *i: a(*i) + b(*i), name="c") >> f = te.create_prim_func([a, b, c]) >> print(f.body.block.reads) [a[], b[]] >> print(f.body.block.writes) [c[]] ``` This PR fixes this issue by enforcing the consistency that `te.CreatePrimFunc` always creates scheduleable blocks with at least 1 block iter: ```python @T.prim_func def func(a: T.Buffer[(), "int32"], b: T.Buffer[(), "int32"], c: T.Buffer[(), "int32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) # body # with T.block("root") with T.block("c"): vi = T.axis.spatial(1, 0) T.reads(a[()], b[()]) T.writes(c[()]) c[()] = a[()] + b[()] ``` --- .../task_scheduler/task_scheduler.cc | 2 ++ src/te/operation/create_primfunc.cc | 8 +++++- .../unittest/test_te_create_primfunc.py | 27 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc index 7485f4e076cdc..fd1d95cd1f19b 100644 --- a/src/meta_schedule/task_scheduler/task_scheduler.cc +++ b/src/meta_schedule/task_scheduler/task_scheduler.cc @@ -94,6 +94,8 @@ void SendToRunner(const Runner& runner, const TuneContext& context, PackedFunc l void TaskSchedulerNode::InitializeTask(int task_id) { TuneContext task = this->tasks[task_id]; + TVM_PY_LOG(INFO, this->logging_func) + << "Initializing Task #" << task_id << ": " << task->task_name; TVM_PY_LOG(INFO, task->logging_func) << "Initializing Task #" << task_id << ": " << task->task_name; CHECK(task->mod.defined()) << "ValueError: Require `context.mod`, but it is not defined"; diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 03ad551c68391..27cfdd605c5d4 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -264,6 +264,12 @@ BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op, } // Set script_parsing_detect_access annotations.Set(tir::attr::script_parsing_detect_access, IntImm(DataType::Int(32), 3)); + if (iter_vars.empty()) { + IterVar iter(Range::FromMinExtent(0, 1), Var("vi", DataType::Int(32)), IterVarType::kDataPar); + PrimExpr binding(0); + iter_vars.push_back(iter); + bindings.push_back(binding); + } // Step 6. Create Block and BlockRealize. 
return BlockRealize(/*iter_values=*/std::move(bindings), @@ -454,7 +460,7 @@ PrimFunc CreatePrimFunc(const Array& arg_list) { {{"global_symbol", String("main")}, {"tir.noalias", Bool(true)}}); const auto* complete = runtime::Registry::Get("script.Complete"); ICHECK(complete); - func = (*complete)(func, info.root_alloc); + func = (*complete)(std::move(func), info.root_alloc); return LayoutFreePlaceholdersNormalizer().Process(std::move(func)); } diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py index 014ca71a8112a..5d9ad003b487c 100644 --- a/tests/python/unittest/test_te_create_primfunc.py +++ b/tests/python/unittest/test_te_create_primfunc.py @@ -524,6 +524,32 @@ def test_int64_indices(): assert loop.extent.dtype == "int64" +def test_zero_dim_add(): + def te_func(): + a = te.placeholder((), name="a", dtype="int32") + b = te.placeholder((), name="b", dtype="int32") + c = te.compute(a.shape, lambda *i: a(*i) + b(*i), name="c") + return [a, b, c] + + @T.prim_func + def expected( + a: T.Buffer[(), "int32"], + b: T.Buffer[(), "int32"], + c: T.Buffer[(), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + with T.block("c"): + vi = T.axis.spatial(1, 0) + T.reads(a[()], b[()]) + T.writes(c[()]) + c[()] = a[()] + b[()] + + _check_workload(te_func, expected) + + if __name__ == "__main__": test_unique_name_complete_block() test_unique_name_reduction_block() @@ -541,3 +567,4 @@ def test_int64_indices(): test_argmax_idx_val() test_argmax_val_idx() test_int64_indices() + test_zero_dim_add() From e60849c89934caa5709d4c42c5b7eda3f26c5e76 Mon Sep 17 00:00:00 2001 From: mhyang-pllab <75776819+mhyang-pllab@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:53:15 +0800 Subject: [PATCH 017/181] Add ceil shape registration (#11533) --- python/tvm/relay/op/_tensor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 23aff8bbb8b42..37cb263c489d3 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -306,3 +306,4 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("sigmoid", False, elemwise_shape_func) register_shape_func("tanh", False, elemwise_shape_func) register_shape_func("logical_not", False, elemwise_shape_func) +register_shape_func("ceil", False, elemwise_shape_func) From 4c513b9de3ebfdf4a1356f0daf7350e74ca74005 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 2 Jun 2022 01:44:05 -0700 Subject: [PATCH 018/181] [Bugfix][TIR] Handle bool tensor in FlattenBuffer (#11532) This PR fixes an existing bug in TIR lowering where the TIR below triggers an error: ```python @T.prim_func def func(a: T.Buffer[10, "bool"], b: T.Buffer[10, "bool"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i in T.serial(10): with T.block("b"): vi = T.axis.spatial(10, i) b[vi] = a[vi] tvm.build(func, target="llvm") ``` The error message is: ``` File "/root/Projects/tvm-dev/src/tir/transforms/flatten_buffer.cc", line 173 TVMError: --------------------------------------------------------------- An error occurred during the execution of TVM. For more information, please see: https://tvm.apache.org/docs/errors.html --------------------------------------------------------------- Check failed: store->buffer->dtype == DataType::Int(8) (bool vs. int8) : Expected int8 backing array for boolean tensor ``` This PR fixes this behavior. 
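For reference, the expected post-fix form is encoded by the regression test added in this patch: boolean buffers are flattened onto `int8` backing arrays, the original boolean views are preserved via `T.preflattened_buffer`, and every access goes through an explicit cast. A sketch of that expected output, mirroring the test below:

```python
from tvm.script import tir as T

# Expected result after FlattenBuffer for the bool example above, mirroring
# the regression test added in this patch: int8 backing arrays, bool views
# recorded via T.preflattened_buffer, and explicit casts on each access.
@T.prim_func
def boolean_handling_after(a: T.Buffer[10, "int8"], b: T.Buffer[10, "int8"]) -> None:
    T.preflattened_buffer(a, [10], dtype="bool", data=a.data)
    T.preflattened_buffer(b, [10], dtype="bool", data=b.data)
    for i0 in T.serial(10):
        b[i0] = T.cast(T.cast(a[i0], "bool"), "int8")
```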
--- src/tir/transforms/flatten_buffer.cc | 18 ++++----- .../test_tir_transform_flatten_buffer.py | 37 ++++++++++++++++++- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/tir/transforms/flatten_buffer.cc b/src/tir/transforms/flatten_buffer.cc index c7cc51d27113a..21de191db0091 100644 --- a/src/tir/transforms/flatten_buffer.cc +++ b/src/tir/transforms/flatten_buffer.cc @@ -53,9 +53,7 @@ class BufferFlattener : public StmtExprMutator { static PrimFunc Flatten(PrimFunc func) { Map preflattened_buffer_map = Merge(func->buffer_map, func->preflattened_buffer_map); - auto pass = BufferFlattener(func->buffer_map); - auto writer = func.CopyOnWrite(); writer->body = pass.VisitStmt(func->body); writer->preflattened_buffer_map = preflattened_buffer_map; @@ -137,7 +135,7 @@ class BufferFlattener : public StmtExprMutator { } else { PrimExpr expr = it->second; if (expr.dtype() != var.dtype()) { - expr = Cast(var.dtype(), std::move(expr)); + expr = tvm::cast(var.dtype(), std::move(expr)); } return expr; } @@ -164,33 +162,35 @@ class BufferFlattener : public StmtExprMutator { Stmt VisitStmt_(const BufferStoreNode* op) final { BufferStore store = Downcast(StmtExprMutator::VisitStmt_(op)); + bool store_returns_bool = (op->value.dtype() == DataType::Bool()); + store = VisitBufferAccess(store); // Handle casts from the value's dtype to the dtype of the // backing array. // TODO(Lunderberg): Move the handling of boolean into a // dedicated pass. - if (store->value.dtype() == DataType::Bool()) { + if (store_returns_bool) { ICHECK_EQ(store->buffer->dtype, DataType::Int(8)) << "Expected int8 backing array for boolean tensor"; auto writer = store.CopyOnWrite(); - writer->value = tir::Cast(DataType::Int(8), store->value); + writer->value = tvm::cast(DataType::Int(8), store->value); + return store; } - auto flattened_indices = store->buffer->ElemOffset(store->indices); - return VisitBufferAccess(std::move(store)); + return store; } PrimExpr VisitExpr_(const BufferLoadNode* op) final { bool load_returns_bool = (op->dtype == DataType::Bool()); BufferLoad load = Downcast(StmtExprMutator::VisitExpr_(op)); load = VisitBufferAccess(load); - // Handle casts from dtype of the backing array to value's dtype. // TODO(Lunderberg): Move the handling of boolean into a // dedicated pass. if (load_returns_bool) { ICHECK_EQ(load->buffer->dtype, DataType::Int(8)) << "Expected int8 backing array for boolean tensor"; - return tir::Cast(DataType::Bool(), load); + load.CopyOnWrite()->dtype = DataType::Int(8); + return tvm::cast(DataType::Bool(), load); } else { return std::move(load); } diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index 65be43aba3212..f1a33a4fb203d 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm -from tvm import tir, te +from tvm import te, tir from tvm.script import tir as T @@ -268,6 +268,33 @@ def annotated_loops(a: T.handle) -> None: A[i] = 0.0 +@T.prim_func +def boolean_handling_before(a: T.Buffer[10, "bool"], b: T.Buffer[10, "bool"]) -> None: + for i0 in T.serial(10): + with T.block("b"): + T.reads(a[i0]) + T.writes(b[i0]) + b[i0] = a[i0] + + +@T.prim_func +def boolean_handling_after(a: T.Buffer[10, "int8"], b: T.Buffer[10, "int8"]) -> None: + T.preflattened_buffer(a, [10], dtype="bool", data=a.data) + T.preflattened_buffer(b, [10], dtype="bool", data=b.data) + # body + for i0 in T.serial(10): + b[i0] = T.cast(T.cast(a[i0], "bool"), "int8") + + +@T.prim_func +def boolean_handle_after(a: T.Buffer[10, "int8"], b: T.Buffer[10, "int8"]) -> None: + T.preflattened_buffer(a, [10], dtype="bool", data=a.data) + T.preflattened_buffer(b, [10], dtype="bool", data=b.data) + # body + for i0 in T.serial(10): + b[i0] = T.cast(T.cast(a[i0], "bool"), "int8") + + def test_elementwise(): _check(compacted_elementwise_func, flattened_elementwise_func) @@ -319,6 +346,13 @@ def test_annotated_loops(): tvm.ir.assert_structural_equal(attr3.value, tvm.tir.FloatImm("float32", 0.0)) +def test_boolean_handling(): + _check(boolean_handling_before, boolean_handling_after) + # mod = tvm.IRModule.from_expr(boolean_handling_before) + # mod = tvm.tir.transform.FlattenBuffer()(mod) + # print(mod.script()) + + if __name__ == "__main__": test_elementwise() test_gpu_workload() @@ -329,3 +363,4 @@ def test_annotated_loops(): test_strided_buffer() test_lower_te() test_annotated_loops() + test_boolean_handling() From bbca53d2ab354d7e8bed11fc9e1eae13fbee7730 Mon Sep 17 00:00:00 2001 From: apeskov Date: Thu, 2 Jun 2022 13:04:12 +0300 Subject: [PATCH 019/181] [DNNL] Add TensorRequisite concept (#11345) Allow to use DNNL runtime in multi instance mode. Thread safe execution of Run() method. Signed-off-by: Alexander Peskov --- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 1412 +++++------------ .../contrib/dnnl/dnnl_tensor_requisite.h | 720 +++++++++ src/runtime/contrib/dnnl/dnnl_utils.cc | 24 +- src/runtime/contrib/dnnl/dnnl_utils.h | 98 +- 4 files changed, 1239 insertions(+), 1015 deletions(-) create mode 100644 src/runtime/contrib/dnnl/dnnl_tensor_requisite.h diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index f6a1c3b790807..a2417f012ea42 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -32,7 +32,12 @@ #include "../json/json_node.h" #include "../json/json_runtime.h" -#include "dnnl.hpp" + +// TODO(@apeskov): Have to mute warning from dnnl headers. 
+// -Wzero-as-null-pointer-constant and -Wdocumentation-unknown-command +#include + +#include "dnnl_tensor_requisite.h" #include "dnnl_utils.h" namespace tvm { @@ -43,552 +48,82 @@ using namespace tvm::runtime; using namespace tvm::runtime::json; class DNNLJSONRuntime : public JSONRuntimeBase { - using tag = dnnl::memory::format_tag; - using dt = dnnl::memory::data_type; - public: DNNLJSONRuntime(const std::string& symbol_name, const std::string& graph_json, const Array const_names) - : JSONRuntimeBase(symbol_name, graph_json, const_names) {} + : JSONRuntimeBase(symbol_name, graph_json, const_names), + next_unique_eid_offset_(data_entry_.size()), + run_arg_eid_(input_var_eid_) { + for (const auto e : outputs_) run_arg_eid_.push_back(EntryID(e)); + } - const char* type_key() const { return "dnnl_json"; } + const char* type_key() const override { return "dnnl_json"; } void Init(const Array& consts) override { - BuildEngine(); - ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; // Setup constants entries for weights. SetupConstants(consts); + BuildEngine(); } - void Run() override { - // Fill in the input buffers. - for (size_t i = 0; i < input_nodes_.size(); ++i) { - auto eid = EntryID(input_nodes_[i], 0); - size_t offset_in_bytes = - entry_out_mem_[eid].second * ((data_entry_[eid]->dtype.bits + 7) / 8); - size_t buffer_size = GetDataSize(*data_entry_[eid]); - write_to_dnnl_memory(data_entry_[eid]->data, entry_out_mem_[eid].first, buffer_size, - offset_in_bytes); - } + /* Unused stub implementation */ + void Run() override { LOG(FATAL) << "Unreachable code"; } - // Invoke the engine through intepreting the stream. - for (size_t i = 0; i < net_.size(); ++i) { - net_.at(i).execute(stream_, net_args_.at(i)); - } - stream_.wait(); - - // Read output buffers. - for (size_t i = 0; i < outputs_.size(); ++i) { - auto eid = EntryID(outputs_[i]); - size_t offset_in_bytes = - entry_out_mem_[eid].second * ((data_entry_[eid]->dtype.bits + 7) / 8); - size_t buffer_size = GetDataSize(*data_entry_[eid]); - read_from_dnnl_memory(data_entry_[eid]->data, entry_out_mem_[eid].first, buffer_size, - offset_in_bytes); + /* Thread safe implementation of Run. 
Keep runtime instance immutable */ + void Run(const TVMArgs& args) const { + auto arg_data_provider = makeIODataProvider(args); + auto mem_solver = tensor_registry_.MakeSolver(arg_data_provider); + // Execute primitives one by one + for (const auto& act : net_) { + auto prim = std::get<0>(act); + auto arg_reqs = std::get<1>(act); + + // Find proper dnnl::memory buffers + std::unordered_map mem_args; + for (const auto& kvp : arg_reqs) mem_args[kvp.first] = mem_solver(kvp.second); + + prim.execute(stream_, mem_args); } } - private: - tag layout2tag(std::string layout) { - static const std::map str2tag = {{"nc", tag::nc}, - {"cn", tag::cn}, - {"tn", tag::tn}, - {"nt", tag::nt}, - {"ncw", tag::ncw}, - {"nwc", tag::nwc}, - {"nchw", tag::nchw}, - {"nhwc", tag::nhwc}, - {"chwn", tag::chwn}, - {"ncdhw", tag::ncdhw}, - {"ndhwc", tag::ndhwc}, - {"oi", tag::oi}, - {"io", tag::io}, - {"oiw", tag::oiw}, - {"owi", tag::owi}, - {"wio", tag::wio}, - {"iwo", tag::iwo}, - {"oihw", tag::oihw}, - {"hwio", tag::hwio}, - {"ohwi", tag::ohwi}, - {"ihwo", tag::ihwo}, - {"iohw", tag::iohw}, - {"oidhw", tag::oidhw}, - {"dhwio", tag::dhwio}, - {"odhwi", tag::odhwi}, - {"iodhw", tag::iodhw}, - {"idhwo", tag::idhwo}, - {"goiw", tag::goiw}, - {"gowi", tag::gowi}, - {"wigo", tag::wigo}, - {"gohwi", tag::gohwi}, - {"goihw", tag::goihw}, - {"hwigo", tag::hwigo}, - {"giohw", tag::giohw}, - {"goidhw", tag::goidhw}, - {"giodhw", tag::giodhw}, - {"godhwi", tag::godhwi}, - {"dhwigo", tag::dhwigo}, - {"tnc", tag::tnc}, - {"ntc", tag::ntc}, - {"ldnc", tag::ldnc}, - {"ldigo", tag::ldigo}, - {"ldgoi", tag::ldgoi}, - {"ldio", tag::ldio}, - {"ldoi", tag::ldoi}, - {"ldgo", tag::ldgo}, - {"nCdhw16c", tag::nCdhw16c}, - {"nCdhw4c", tag::nCdhw4c}, - {"nCdhw8c", tag::nCdhw8c}, - {"nChw16c", tag::nChw16c}, - {"nChw4c", tag::nChw4c}, - {"nChw8c", tag::nChw8c}, - {"nCw16c", tag::nCw16c}, - {"nCw4c", tag::nCw4c}, - {"nCw8c", tag::nCw8c}, - {"NCw16n16c", tag::NCw16n16c}, - {"NChw16n16c", tag::NChw16n16c}, - {"NCdhw16n16c", tag::NCdhw16n16c}, - {"NCdhw32n32c", tag::NCdhw32n32c}, - {"NChw32n32c", tag::NChw32n32c}, - {"IOhw16i16o", tag::IOhw16i16o}, - {"OI16i16o", tag::OI16i16o}, - {"OI16i32o", tag::OI16i32o}, - {"OI16i64o", tag::OI16i64o}, - {"OI8i16o2i", tag::OI8i16o2i}, - {"OI8i32o2i", tag::OI8i32o2i}, - {"OI8i64o2i", tag::OI8i64o2i}, - {"OI4i16o4i", tag::OI4i16o4i}, - {"OI4i32o4i", tag::OI4i32o4i}, - {"OI4i64o4i", tag::OI4i64o4i}, - {"Ohwi32o", tag::Ohwi32o}, - {"IOdhw16i16o", tag::IOdhw16i16o}, - {"gIOhw16i16o", tag::gIOhw16i16o}, - {"gOhwi32o", tag::gOhwi32o}, - {"Goidhw16g", tag::Goidhw16g}, - {"IOw16o16i", tag::IOw16o16i}, - {"OIw16i16o", tag::OIw16i16o}, - {"OIw16i32o", tag::OIw16i32o}, - {"OIw16i64o", tag::OIw16i64o}, - {"IOw16i16o", tag::IOw16i16o}, - {"gIOw16i16o", tag::gIOw16i16o}, - {"OIw16o16i", tag::OIw16o16i}, - {"Oiw16o", tag::Oiw16o}, - {"OIw4i16o4i", tag::OIw4i16o4i}, - {"OIw4i32o4i", tag::OIw4i32o4i}, - {"OIw4i64o4i", tag::OIw4i64o4i}, - {"OIw2i8o4i", tag::OIw2i8o4i}, - {"OIw4i4o", tag::OIw4i4o}, - {"OIw4o4i", tag::OIw4o4i}, - {"Oiw4o", tag::Oiw4o}, - {"OIw8i16o2i", tag::OIw8i16o2i}, - {"OIw8i32o2i", tag::OIw8i32o2i}, - {"OIw8i64o2i", tag::OIw8i64o2i}, - {"OIw8i8o", tag::OIw8i8o}, - {"OIw8o16i2o", tag::OIw8o16i2o}, - {"OIw8o8i", tag::OIw8o8i}, - {"OIw8o4i", tag::OIw8o4i}, - {"OIw16i16o4i", tag::OIw16i16o4i}, - {"OIw16i32o4i", tag::OIw16i32o4i}, - {"OIw16i48o4i", tag::OIw16i48o4i}, - {"OIw16i64o4i", tag::OIw16i64o4i}, - {"OIw16i16o2i", tag::OIw16i16o2i}, - {"OIw16i32o2i", tag::OIw16i32o2i}, - {"OIw16i48o2i", tag::OIw16i48o2i}, 
- {"OIw16i64o2i", tag::OIw16i64o2i}, - {"OIw16o16i2o", tag::OIw16o16i2o}, - {"Owi16o", tag::Owi16o}, - {"OwI16o2i", tag::OwI16o2i}, - {"Owi4o", tag::Owi4o}, - {"Owi8o", tag::Owi8o}, - {"IOhw16o16i", tag::IOhw16o16i}, - {"Ohwi16o", tag::Ohwi16o}, - {"OhwI16o2i", tag::OhwI16o2i}, - {"Ohwi4o", tag::Ohwi4o}, - {"Ohwi8o", tag::Ohwi8o}, - {"OIhw16i16o", tag::OIhw16i16o}, - {"OIhw16i32o", tag::OIhw16i32o}, - {"OIhw16i64o", tag::OIhw16i64o}, - {"OIhw16o16i", tag::OIhw16o16i}, - {"Oihw16o", tag::Oihw16o}, - {"OIhw4i16o4i", tag::OIhw4i16o4i}, - {"OIhw4i32o4i", tag::OIhw4i32o4i}, - {"OIhw4i64o4i", tag::OIhw4i64o4i}, - {"OIhw4i4o", tag::OIhw4i4o}, - {"OIhw4o4i", tag::OIhw4o4i}, - {"Oihw4o", tag::Oihw4o}, - {"OIhw8i16o2i", tag::OIhw8i16o2i}, - {"OIhw8i32o2i", tag::OIhw8i32o2i}, - {"OIhw8i64o2i", tag::OIhw8i64o2i}, - {"OIhw8i8o", tag::OIhw8i8o}, - {"OIhw8o16i2o", tag::OIhw8o16i2o}, - {"OIhw8o8i", tag::OIhw8o8i}, - {"OIhw8o4i", tag::OIhw8o4i}, - {"OIhw2i8o4i", tag::OIhw2i8o4i}, - {"IOdhw16o16i", tag::IOdhw16o16i}, - {"Odhwi16o", tag::Odhwi16o}, - {"OdhwI16o2i", tag::OdhwI16o2i}, - {"Odhwi4o", tag::Odhwi4o}, - {"Odhwi8o", tag::Odhwi8o}, - {"OIdhw16i16o", tag::OIdhw16i16o}, - {"OIdhw16i32o", tag::OIdhw16i32o}, - {"OIdhw16i64o", tag::OIdhw16i64o}, - {"OIdhw16o16i", tag::OIdhw16o16i}, - {"Oidhw16o", tag::Oidhw16o}, - {"OIdhw4i4o", tag::OIdhw4i4o}, - {"OIdhw4o4i", tag::OIdhw4o4i}, - {"Oidhw4o", tag::Oidhw4o}, - {"OIdhw8i16o2i", tag::OIdhw8i16o2i}, - {"OIdhw8i32o2i", tag::OIdhw8i32o2i}, - {"OIdhw8i64o2i", tag::OIdhw8i64o2i}, - {"OIdhw4i16o4i", tag::OIdhw4i16o4i}, - {"OIdhw16i16o4i", tag::OIdhw16i16o4i}, - {"OIdhw16i32o4i", tag::OIdhw16i32o4i}, - {"OIdhw16i48o4i", tag::OIdhw16i48o4i}, - {"OIdhw16i64o4i", tag::OIdhw16i64o4i}, - {"OIdhw16i16o2i", tag::OIdhw16i16o2i}, - {"OIdhw16i32o2i", tag::OIdhw16i32o2i}, - {"OIdhw16i48o2i", tag::OIdhw16i48o2i}, - {"OIdhw16i64o2i", tag::OIdhw16i64o2i}, - {"OIdhw4i32o4i", tag::OIdhw4i32o4i}, - {"OIdhw4i64o4i", tag::OIdhw4i64o4i}, - {"OIdhw2i8o4i", tag::OIdhw2i8o4i}, - {"OIdhw8i8o", tag::OIdhw8i8o}, - {"OIdhw8o8i", tag::OIdhw8o8i}, - {"OIdhw8o4i", tag::OIdhw8o4i}, - {"gIOw16o16i", tag::gIOw16o16i}, - {"gOIw16i16o", tag::gOIw16i16o}, - {"gOIw16o16i", tag::gOIw16o16i}, - {"gOiw16o", tag::gOiw16o}, - {"gOIw4i16o4i", tag::gOIw4i16o4i}, - {"gOIw2i8o4i", tag::gOIw2i8o4i}, - {"gOIw4i4o", tag::gOIw4i4o}, - {"gOIw4o4i", tag::gOIw4o4i}, - {"gOiw4o", tag::gOiw4o}, - {"gOIw8i16o2i", tag::gOIw8i16o2i}, - {"gOIw8i8o", tag::gOIw8i8o}, - {"gOIw8o16i2o", tag::gOIw8o16i2o}, - {"gOIw8o8i", tag::gOIw8o8i}, - {"gOIw8o4i", tag::gOIw8o4i}, - {"gOIw16i16o4i", tag::gOIw16i16o4i}, - {"gOIw16i16o2i", tag::gOIw16i16o2i}, - {"gOIw16o16i2o", tag::gOIw16o16i2o}, - {"gOwi16o", tag::gOwi16o}, - {"gOwI16o2i", tag::gOwI16o2i}, - {"gOwi4o", tag::gOwi4o}, - {"gOwi8o", tag::gOwi8o}, - {"Goiw8g", tag::Goiw8g}, - {"Goiw16g", tag::Goiw16g}, - {"gIOhw16o16i", tag::gIOhw16o16i}, - {"gOhwi16o", tag::gOhwi16o}, - {"gOhwI16o2i", tag::gOhwI16o2i}, - {"gOhwi4o", tag::gOhwi4o}, - {"gOhwi8o", tag::gOhwi8o}, - {"Goihw16g", tag::Goihw16g}, - {"gOIhw16i16o", tag::gOIhw16i16o}, - {"gOIhw16o16i", tag::gOIhw16o16i}, - {"gOihw16o", tag::gOihw16o}, - {"gOIhw4i16o4i", tag::gOIhw4i16o4i}, - {"gOIhw2i8o4i", tag::gOIhw2i8o4i}, - {"gOIhw4i4o", tag::gOIhw4i4o}, - {"gOIhw4o4i", tag::gOIhw4o4i}, - {"gOihw4o", tag::gOihw4o}, - {"Goihw8g", tag::Goihw8g}, - {"gOIhw8i16o2i", tag::gOIhw8i16o2i}, - {"gOIhw8i8o", tag::gOIhw8i8o}, - {"gOIhw8o16i2o", tag::gOIhw8o16i2o}, - {"OIw4o8i8o4i", tag::OIw4o8i8o4i}, - {"OIdhw4o8i8o4i", tag::OIdhw4o8i8o4i}, - 
{"OIhw4o8i8o4i", tag::OIhw4o8i8o4i}, - {"OIhw2o8i8o2i", tag::OIhw2o8i8o2i}, - {"gOIw4o8i8o4i", tag::gOIw4o8i8o4i}, - {"gOIdhw4o8i8o4i", tag::gOIdhw4o8i8o4i}, - {"gOIhw4o8i8o4i", tag::gOIhw4o8i8o4i}, - {"gOIhw2o8i8o2i", tag::gOIhw2o8i8o2i}, - {"OIhw16i16o4i", tag::OIhw16i16o4i}, - {"OIhw16i32o4i", tag::OIhw16i32o4i}, - {"OIhw16i48o4i", tag::OIhw16i48o4i}, - {"OIhw16i64o4i", tag::OIhw16i64o4i}, - {"OIhw16i16o2i", tag::OIhw16i16o2i}, - {"OIhw16i32o2i", tag::OIhw16i32o2i}, - {"OIhw16i48o2i", tag::OIhw16i48o2i}, - {"OIhw16i64o2i", tag::OIhw16i64o2i}, - {"OIhw16o16i2o", tag::OIhw16o16i2o}, - {"gOIhw16i16o4i", tag::gOIhw16i16o4i}, - {"gOIhw16i16o2i", tag::gOIhw16i16o2i}, - {"gOIhw16o16i2o", tag::gOIhw16o16i2o}, - {"gOIhw8o8i", tag::gOIhw8o8i}, - {"gOIhw8o4i", tag::gOIhw8o4i}, - {"gIOdhw16i16o", tag::gIOdhw16i16o}, - {"gIOdhw16o16i", tag::gIOdhw16o16i}, - {"gOdhwi16o", tag::gOdhwi16o}, - {"gOdhwI16o2i", tag::gOdhwI16o2i}, - {"gOdhwi4o", tag::gOdhwi4o}, - {"gOdhwi8o", tag::gOdhwi8o}, - {"gOIdhw16i16o", tag::gOIdhw16i16o}, - {"gOIdhw16o16i", tag::gOIdhw16o16i}, - {"gOidhw16o", tag::gOidhw16o}, - {"gOIdhw4i4o", tag::gOIdhw4i4o}, - {"gOIdhw4o4i", tag::gOIdhw4o4i}, - {"gOidhw4o", tag::gOidhw4o}, - {"gOIdhw8i16o2i", tag::gOIdhw8i16o2i}, - {"gOIdhw4i16o4i", tag::gOIdhw4i16o4i}, - {"gOIdhw16i16o4i", tag::gOIdhw16i16o4i}, - {"gOIdhw16i16o2i", tag::gOIdhw16i16o2i}, - {"gOIdhw2i8o4i", tag::gOIdhw2i8o4i}, - {"gOIdhw8i8o", tag::gOIdhw8i8o}, - {"gOIdhw8o8i", tag::gOIdhw8o8i}, - {"gOIdhw8o4i", tag::gOIdhw8o4i}, - {"gOIw2i4o2i", tag::gOIw2i4o2i}, - {"gOIhw2i4o2i", tag::gOIhw2i4o2i}, - {"gOIdhw2i4o2i", tag::gOIdhw2i4o2i}, - {"gOIw2o4i2o", tag::gOIw2o4i2o}, - {"gOIhw2o4i2o", tag::gOIhw2o4i2o}, - {"gOIdhw2o4i2o", tag::gOIdhw2o4i2o}, - {"gOIw4i8o2i", tag::gOIw4i8o2i}, - {"gOIhw4i8o2i", tag::gOIhw4i8o2i}, - {"gOIdhw4i8o2i", tag::gOIdhw4i8o2i}, - {"gOIw4o8i2o", tag::gOIw4o8i2o}, - {"gOIhw4o8i2o", tag::gOIhw4o8i2o}, - {"gOIdhw4o8i2o", tag::gOIdhw4o8i2o}, - {"ldOi32o", tag::ldOi32o}, - {"ldOI32o4i", tag::ldOI32o4i}, - {"ldgOi32o", tag::ldgOi32o}, - {"ldgOI32o2i", tag::ldgOI32o2i}, - {"ldgOI32o4i", tag::ldgOI32o4i}, - {"OwI16o4i", tag::OwI16o4i}, - {"OhwI16o4i", tag::OhwI16o4i}, - {"gOwI16o4i", tag::gOwI16o4i}, - {"gOhwI16o4i", tag::gOhwI16o4i}, - {"OdhwI16o4i", tag::OdhwI16o4i}, - {"gOdhwI16o4i", tag::gOdhwI16o4i}, - {"Owi32o", tag::Owi32o}, - {"OwI32o2i", tag::OwI32o2i}, - {"OwI32o4i", tag::OwI32o4i}, - {"Owi48o", tag::Owi48o}, - {"OwI48o2i", tag::OwI48o2i}, - {"OwI48o4i", tag::OwI48o4i}, - {"Owi64o", tag::Owi64o}, - {"OwI64o2i", tag::OwI64o2i}, - {"OwI64o4i", tag::OwI64o4i}, - {"wIo2i", tag::wIo2i}, - {"wIo4i", tag::wIo4i}, - {"gOwi32o", tag::gOwi32o}, - {"gOwI32o2i", tag::gOwI32o2i}, - {"gOwI32o4i", tag::gOwI32o4i}, - {"gOwi48o", tag::gOwi48o}, - {"gOwI48o2i", tag::gOwI48o2i}, - {"gOwI48o4i", tag::gOwI48o4i}, - {"gOwi64o", tag::gOwi64o}, - {"gOwI64o2i", tag::gOwI64o2i}, - {"gOwI64o4i", tag::gOwI64o4i}, - {"gwio", tag::gwio}, - {"gwIo2i", tag::gwIo2i}, - {"gwIo4i", tag::gwIo4i}, - {"OhwI32o", tag::OhwI32o}, - {"OhwI32o2i", tag::OhwI32o2i}, - {"OhwI32o4i", tag::OhwI32o4i}, - {"Ohwi48o", tag::Ohwi48o}, - {"OhwI48o2i", tag::OhwI48o2i}, - {"OhwI48o4i", tag::OhwI48o4i}, - {"Ohwi64o", tag::Ohwi64o}, - {"OhwI64o2i", tag::OhwI64o2i}, - {"OhwI64o4i", tag::OhwI64o4i}, - {"hwIo2i", tag::hwIo2i}, - {"hwIo4i", tag::hwIo4i}, - {"gOhwI32o", tag::gOhwI32o}, - {"gOhwI32o2i", tag::gOhwI32o2i}, - {"gOhwI32o4i", tag::gOhwI32o4i}, - {"gOhwi48o", tag::gOhwi48o}, - {"gOhwI48o2i", tag::gOhwI48o2i}, - {"gOhwI48o4i", tag::gOhwI48o4i}, - 
{"gOhwi64o", tag::gOhwi64o}, - {"gOhwI64o2i", tag::gOhwI64o2i}, - {"gOhwI64o4i", tag::gOhwI64o4i}, - {"ghwio", tag::ghwio}, - {"ghwIo2i", tag::ghwIo2i}, - {"ghwIo4i", tag::ghwIo4i}, - {"Odhwi32o", tag::Odhwi32o}, - {"OdhwI32o2i", tag::OdhwI32o2i}, - {"OdhwI32o4i", tag::OdhwI32o4i}, - {"Odhwi48o", tag::Odhwi48o}, - {"OdhwI48o2i", tag::OdhwI48o2i}, - {"OdhwI48o4i", tag::OdhwI48o4i}, - {"Odhwi64o", tag::Odhwi64o}, - {"OdhwI64o2i", tag::OdhwI64o2i}, - {"OdhwI64o4i", tag::OdhwI64o4i}, - {"dhwIo2i", tag::dhwIo2i}, - {"dhwIo4i", tag::dhwIo4i}, - {"gOdhwi32o", tag::gOdhwi32o}, - {"gOdhwI32o2i", tag::gOdhwI32o2i}, - {"gOdhwI32o4i", tag::gOdhwI32o4i}, - {"gOdhwi48o", tag::gOdhwi48o}, - {"gOdhwI48o2i", tag::gOdhwI48o2i}, - {"gOdhwI48o4i", tag::gOdhwI48o4i}, - {"gOdhwi64o", tag::gOdhwi64o}, - {"gOdhwI64o2i", tag::gOdhwI64o2i}, - {"gOdhwI64o4i", tag::gOdhwI64o4i}, - {"gdhwio", tag::gdhwio}, - {"gdhwIo2i", tag::gdhwIo2i}, - {"gdhwIo4i", tag::gdhwIo4i}, - {"ldIo32i", tag::ldIo32i}, - {"ldgIo32i", tag::ldgIo32i}, - {"ldgIO32i2o", tag::ldgIO32i2o}, - {"nCdhw32c", tag::nCdhw32c}, - {"nChw32c", tag::nChw32c}, - {"nCw32c", tag::nCw32c}, - {"NCw32n16c", tag::NCw32n16c}, - {"NChw32n16c", tag::NChw32n16c}, - {"NCdhw32n16c", tag::NCdhw32n16c}, - {"NCw32n32c", tag::NCw32n32c}, - {"OI16i16o4i", tag::OI16i16o4i}, - {"IOw8o16i2o", tag::IOw8o16i2o}, - {"IOhw8o16i2o", tag::IOhw8o16i2o}, - {"Owhi16o", tag::Owhi16o}, - {"OIdhw8o16i2o", tag::OIdhw8o16i2o}, - {"IOdhw8o16i2o", tag::IOdhw8o16i2o}, - {"Goiw4g", tag::Goiw4g}, - {"gIOw8o16i2o", tag::gIOw8o16i2o}, - {"Goiw32g", tag::Goiw32g}, - {"Goihw4g", tag::Goihw4g}, - {"gIOhw8o16i2o", tag::gIOhw8o16i2o}, - {"Goihw32g", tag::Goihw32g}, - {"gOwhi16o", tag::gOwhi16o}, - {"IOw4i8o8i4o", tag::IOw4i8o8i4o}, - {"IOhw4i8o8i4o", tag::IOhw4i8o8i4o}, - {"IOdhw4i8o8i4o", tag::IOdhw4i8o8i4o}, - {"gIOw4i8o8i4o", tag::gIOw4i8o8i4o}, - {"gIOhw4i8o8i4o", tag::gIOhw4i8o8i4o}, - {"gIOdhw4i8o8i4o", tag::gIOdhw4i8o8i4o}, - {"gOIdhw8o16i2o", tag::gOIdhw8o16i2o}, - {"gIOdhw8o16i2o", tag::gIOdhw8o16i2o}, - {"Goidhw32g", tag::Goidhw32g}, - {"OI16i32o4i", tag::OI16i32o4i}, - {"OI16i48o4i", tag::OI16i48o4i}, - {"OI16i64o4i", tag::OI16i64o4i}, - {"OI16i16o2i", tag::OI16i16o2i}, - {"OI16i32o2i", tag::OI16i32o2i}, - {"OI16i48o2i", tag::OI16i48o2i}, - {"OI16i64o2i", tag::OI16i64o2i}, - {"OwI16i16o2i", tag::OwI16i16o2i}, - {"gOwI16i16o2i", tag::gOwI16i16o2i}, - {"OhwI16i16o2i", tag::OhwI16i16o2i}, - {"gOhwI16i16o2i", tag::gOhwI16i16o2i}, - {"OdhwI16i16o2i", tag::OdhwI16i16o2i}, - {"gOdhwI16i16o2i", tag::gOdhwI16i16o2i}, - {"OwI16i16o4i", tag::OwI16i16o4i}, - {"gOwI16i16o4i", tag::gOwI16i16o4i}, - {"OhwI16i16o4i", tag::OhwI16i16o4i}, - {"gOhwI16i16o4i", tag::gOhwI16i16o4i}, - {"OdhwI16i16o4i", tag::OdhwI16i16o4i}, - {"gOdhwI16i16o4i", tag::gOdhwI16i16o4i}, - {"OwI16i32o2i", tag::OwI16i32o2i}, - {"OwI16i32o4i", tag::OwI16i32o4i}, - {"OwI16i48o2i", tag::OwI16i48o2i}, - {"OwI16i48o4i", tag::OwI16i48o4i}, - {"OwI16i64o2i", tag::OwI16i64o2i}, - {"OwI16i64o4i", tag::OwI16i64o4i}, - {"gOwI16i32o2i", tag::gOwI16i32o2i}, - {"gOwI16i32o4i", tag::gOwI16i32o4i}, - {"gOwI16i48o2i", tag::gOwI16i48o2i}, - {"gOwI16i48o4i", tag::gOwI16i48o4i}, - {"gOwI16i64o2i", tag::gOwI16i64o2i}, - {"gOwI16i64o4i", tag::gOwI16i64o4i}, - {"OhwI16i32o2i", tag::OhwI16i32o2i}, - {"OhwI16i32o4i", tag::OhwI16i32o4i}, - {"OhwI16i48o2i", tag::OhwI16i48o2i}, - {"OhwI16i48o4i", tag::OhwI16i48o4i}, - {"OhwI16i64o2i", tag::OhwI16i64o2i}, - {"OhwI16i64o4i", tag::OhwI16i64o4i}, - {"gOhwI16i32o2i", tag::gOhwI16i32o2i}, - {"gOhwI16i32o4i", 
tag::gOhwI16i32o4i}, - {"gOhwI16i48o2i", tag::gOhwI16i48o2i}, - {"gOhwI16i48o4i", tag::gOhwI16i48o4i}, - {"gOhwI16i64o2i", tag::gOhwI16i64o2i}, - {"gOhwI16i64o4i", tag::gOhwI16i64o4i}, - {"OdhwI16i32o2i", tag::OdhwI16i32o2i}, - {"OdhwI16i32o4i", tag::OdhwI16i32o4i}, - {"OdhwI16i48o2i", tag::OdhwI16i48o2i}, - {"OdhwI16i48o4i", tag::OdhwI16i48o4i}, - {"OdhwI16i64o2i", tag::OdhwI16i64o2i}, - {"OdhwI16i64o4i", tag::OdhwI16i64o4i}, - {"gOdhwI16i32o2i", tag::gOdhwI16i32o2i}, - {"gOdhwI16i32o4i", tag::gOdhwI16i32o4i}, - {"gOdhwI16i48o2i", tag::gOdhwI16i48o2i}, - {"gOdhwI16i48o4i", tag::gOdhwI16i48o4i}, - {"gOdhwI16i64o2i", tag::gOdhwI16i64o2i}, - {"gOdhwI16i64o4i", tag::gOdhwI16i64o4i}, - {"hwioG16g", tag::hwioG16g}, - {"NCdhw40n32c", tag::NCdhw40n32c}, - {"NChw40n32c", tag::NChw40n32c}, - {"NCw40n32c", tag::NCw40n32c}, - {"OIdhw4o8i8o2i", tag::OIdhw4o8i8o2i}, - {"OIhw4o8i8o2i", tag::OIhw4o8i8o2i}, - {"OIw4o8i8o2i", tag::OIw4o8i8o2i}, - {"gOIdhw4o8i8o2i", tag::gOIdhw4o8i8o2i}, - {"gOIhw4o8i8o2i", tag::gOIhw4o8i8o2i}, - {"gOIw4o8i8o2i", tag::gOIw4o8i8o2i}, - {"IOdhw4i8o8i2o", tag::IOdhw4i8o8i2o}, - {"IOhw4i8o8i2o", tag::IOhw4i8o8i2o}, - {"IOw4i8o8i2o", tag::IOw4i8o8i2o}, - {"gIOdhw4i8o8i2o", tag::gIOdhw4i8o8i2o}, - {"gIOhw4i8o8i2o", tag::gIOhw4i8o8i2o}, - {"gIOw4i8o8i2o", tag::gIOw4i8o8i2o}, - {"NCdhw40n16c", tag::NCdhw40n16c}, - {"NCw40n16c", tag::NCw40n16c}, - {"NChw40n16c", tag::NChw40n16c}, - {"NCw2c32n8c", tag::NCw2c32n8c}, - {"NChw2c32n8c", tag::NChw2c32n8c}, - {"NCdhw2c32n8c", tag::NCdhw2c32n8c}, - {"OIw2i8o16i4o", tag::OIw2i8o16i4o}, - {"OIhw2i8o16i4o", tag::OIhw2i8o16i4o}, - {"OIdhw2i8o16i4o", tag::OIdhw2i8o16i4o}, - {"OIw2o8i16o4i", tag::OIw2o8i16o4i}, - {"OIw2o8i16o2i", tag::OIw2o8i16o2i}, - {"IOw2i8o16i4o", tag::IOw2i8o16i4o}, - {"IOw2i8o16i2o", tag::IOw2i8o16i2o}, - {"OIhw2o8i16o4i", tag::OIhw2o8i16o4i}, - {"OIhw2o8i16o2i", tag::OIhw2o8i16o2i}, - {"IOhw2i8o16i4o", tag::IOhw2i8o16i4o}, - {"IOhw2i8o16i2o", tag::IOhw2i8o16i2o}, - {"OIdhw2o8i16o4i", tag::OIdhw2o8i16o4i}, - {"OIdhw2o8i16o2i", tag::OIdhw2o8i16o2i}, - {"IOdhw2i8o16i4o", tag::IOdhw2i8o16i4o}, - {"IOdhw2i8o16i2o", tag::IOdhw2i8o16i2o}, - {"gOIw2o8i16o2i", tag::gOIw2o8i16o2i}, - {"gIOw2i8o16i2o", tag::gIOw2i8o16i2o}, - {"gIOhw2i8o16i2o", tag::gIOhw2i8o16i2o}, - {"gIOdhw2i8o16i2o", tag::gIOdhw2i8o16i2o}, - {"gOIhw2o8i16o2i", tag::gOIhw2o8i16o2i}, - {"gOIdhw2o8i16o2i", tag::gOIdhw2o8i16o2i}, - {"gOIw2o8i16o4i", tag::gOIw2o8i16o4i}, - {"gOIhw2o8i16o4i", tag::gOIhw2o8i16o4i}}; - std::string key = ""; - for (const auto& c : layout) { - if (std::isalpha(c, std::locale("C"))) { - char lower_c = std::tolower(c); - if (std::isupper(c) && (layout.find(lower_c) != std::string::npos)) { - key.push_back(c); - } else { - key.push_back(lower_c); - } - } else if (std::isdigit(c)) { - key.push_back(c); - } else { - LOG(FATAL) << "invalid char '" << c << "' in " << layout << std::endl; - } - } - if (str2tag.count(key) == 0) { - LOG(WARNING) << "convert unregistered layout '" << key << "' to tag::any"; - return tag::any; + /* Override GetFunction to reimplement Run method */ + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) override { + if (this->symbol_name_ == name) { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + ICHECK(this->initialized_) << "The module has not been initialized"; + + ICHECK_EQ(args.size(), input_var_eid_.size() + outputs_.size()) + << "Found mismatch in the number of provided data entries and required."; + + Run(args); + }); } else { - return str2tag.at(key); + 
return JSONRuntimeBase::GetFunction(name, sptr_to_self); + } + } + + /* Same as makeInitDataProvider but in case of InputOutput return real DLTensor */ + TensorRegistry::DLTensorProvider makeIODataProvider(const TVMArgs& args) const { + auto extract_dl_tensor = [](const TVMArgValue& val) -> const DLTensor* { + ICHECK(val.type_code() == kTVMNDArrayHandle || val.type_code() == kTVMDLTensorHandle) + << "Expect NDArray or DLTensor"; + return val.IsObjectRef() ? val.operator NDArray().operator->() + : val.operator DLTensor*(); + }; + + std::map io_map; // eid to dl tensor map + for (size_t i = 0; i < run_arg_eid_.size(); i++) { + io_map[run_arg_eid_[i]] = extract_dl_tensor(args[i]); } + + // lambda with captured IO data handlers + return [io_map](uint32_t eid) -> const DLTensor* { return io_map.at(eid); }; } - std::map elt_name2algo{ + private: + const std::map elt_name2algo{ {"abs", dnnl::algorithm::eltwise_abs}, {"exp", dnnl::algorithm::eltwise_exp}, {"log", dnnl::algorithm::eltwise_log}, @@ -626,64 +161,14 @@ class DNNLJSONRuntime : public JSONRuntimeBase { return std::regex_match(op_name, bias_add_pat) ? true : false; } - dnnl::memory::dims TransDims2Plain(dnnl::memory::dims input_dims, std::string layout) { - std::vector axis = { - 'N', 'C', 'O', 'I', 'D', 'H', 'W', - }; - dnnl::memory::dims out_dims; - std::string::iterator t = layout.begin(); - // Remove numbers in layout string to match the size of input_dims - while (t != layout.end()) { - if (*t >= '0' && *t <= '9') { - layout.erase(t); - } else { - t++; - } - } - // Push the correct shapes of each axis into the output_dims - for (auto a : axis) { - if (layout.find(a) != std::string::npos) { - dnnl::memory::dim shape = input_dims[layout.find(a)]; - char lower_a = std::tolower(a); - for (size_t i = 0; i < layout.size(); ++i) { - if (lower_a == layout[i]) { - shape *= input_dims[i]; - } - } - out_dims.push_back(shape); - } - } - // Multiply O and I with G, respectively - if (layout.find("G") != std::string::npos) { - dnnl::memory::dim G = 1; - if (layout.find("g") != std::string::npos) { - G = input_dims[layout.find("g")] * input_dims[layout.find("G")]; - } else { - G = input_dims[layout.find("G")]; - } - out_dims[0] *= G; - out_dims[1] *= G; - } - return out_dims; - } - - dnnl::memory::dims TransformStr2Dims(std::vector strs, bool dilates = false) { - dnnl::memory::dims out_dims; - if (dilates) { - std::transform(strs.begin(), strs.end(), std::back_inserter(out_dims), - [](const std::string& str) { return std::stoi(str) - 1; }); - } else { - std::transform(strs.begin(), strs.end(), std::back_inserter(out_dims), - [](const std::string& str) { return std::stoi(str); }); - } - return out_dims; - } - // Build up the engine based on the input graph. void BuildEngine() { engine_ = dnnl::engine(dnnl::engine::kind::cpu, 0); stream_ = dnnl::stream(engine_); + std::set io_eid_set(run_arg_eid_.begin(), run_arg_eid_.end()); + tensor_registry_ = TensorRegistry(engine_, io_eid_set); + std::regex conv_pat(".*conv[1-3]d.*"); std::regex deconv_pat(".*deconv[1-3]d.*"); std::regex conv_transpose_pat(".*conv[1-3]d_transpose.*"); @@ -725,562 +210,471 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } } - // Bind a JSON graph node entry to a DNNL memory. 
- dnnl::memory BindDNNLMemory(const JSONGraphNodeEntry& entry, dnnl::memory::desc mem_desc, - size_t offset = 0) { - auto eid = EntryID(entry); - if (entry_out_mem_.count(eid) == 0) { - return BindDNNLMemory(entry, dnnl::memory(mem_desc, engine_), offset); - } - return entry_out_mem_[eid].first; - } - - // Bind a JSON graph node entry to a given DNNL memory. - dnnl::memory BindDNNLMemory(const JSONGraphNodeEntry& entry, dnnl::memory mem, - size_t offset = 0) { - auto eid = EntryID(entry); - // Since the DNNL memory has been created before calling this function, we assume the entry - // has not yet been bound to the other DNNL memory; otherwise it may have memory leak. - ICHECK_EQ(entry_out_mem_.count(eid), 0); - - entry_out_mem_[eid] = {mem, offset}; - return entry_out_mem_[eid].first; - } - void Convolution(const size_t& nid) { auto node = nodes_[nid]; auto op_name = node.GetOpName(); dnnl::primitive_attr attr; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. - auto data_entry = node.GetInputs()[0]; - auto weight_entry = node.GetInputs()[1]; - JSONGraphNodeEntry out_entry(nid, 0); - dnnl::memory::dims input_shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - dnnl::memory::dims weight_shape = nodes_[weight_entry.id_].GetOpShape()[weight_entry.index_]; - dnnl::memory::dims out_shape = nodes_[out_entry.id_].GetOpShape()[out_entry.index_]; - dnnl::memory::dim channels = - node.GetAttr>("channels")[0] != "" - ? std::stoi(node.GetAttr>("channels")[0]) - : out_shape[1]; - std::vector str_strides = node.GetAttr>("strides"); - std::vector str_dilates = node.GetAttr>("dilation"); - std::vector str_padding = node.GetAttr>("padding"); - std::vector str_padding_l(str_padding.begin(), - str_padding.begin() + str_padding.size() / 2); - std::vector str_padding_r(str_padding.end() - str_padding.size() / 2, - str_padding.end()); - dnnl::memory::dim groups = std::stoi(node.GetAttr>("groups")[0]); - std::string data_layout = node.GetAttr>("data_layout")[0]; - std::string kernel_layout = node.GetAttr>("kernel_layout")[0]; - - // Memory shapes. - dnnl::memory::dims src_dims = TransDims2Plain(input_shape, data_layout); - dnnl::memory::dims weights_dims_ = TransDims2Plain(weight_shape, kernel_layout); - dnnl::memory::dims bias_dims = {channels}; - dnnl::memory::dims strides_dims = TransformStr2Dims(str_strides); - dnnl::memory::dims dilates_dims = TransformStr2Dims(str_dilates, true); - dnnl::memory::dims padding_dims_l = TransformStr2Dims(str_padding_l); - dnnl::memory::dims padding_dims_r = TransformStr2Dims(str_padding_r); - dnnl::memory::dims dst_dims = src_dims; - dst_dims[1] = channels; - weights_dims_[0] = channels; - weights_dims_[1] = src_dims[1]; - for (size_t i = 2; i < src_dims.size(); i++) { - dnnl::memory::dim K = weights_dims_[i]; - dnnl::memory::dim S = strides_dims[i - 2]; - dnnl::memory::dim D = dilates_dims[i - 2]; - dnnl::memory::dim PL = padding_dims_l[i - 2]; - dnnl::memory::dim PR = padding_dims_r[i - 2]; - dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); - dst_dims[i] = (src_dims[i] - DK + PL + PR) / S + 1; + auto src_tr = GetInput(nid, 0); + auto wgh_tr = GetInput(nid, 1); + auto dst_tr = GetOutput(nid, 0); + auto bias_tr = has_bias ? 
GetInput(nid, 2) : GetInput(nid, -1); + auto strides = GetNodeAttr>(node, "strides"); + auto dilates = GetNodeAttr>(node, "dilation"); + auto padding = GetNodeAttr>(node, "padding"); + std::vector padding_l(padding.begin(), padding.begin() + padding.size() / 2); + std::vector padding_r(padding.begin() + padding.size() / 2, padding.end()); + auto groups = GetNodeAttr(node, "groups"); + auto src_layout = GetNodeAttr(node, "data_layout"); + auto dst_layout = GetNodeAttr(node, "out_layout"); + auto wgh_layout = GetNodeAttr(node, "kernel_layout"); + + // dst_layout == "" means to use data_layout + if (dst_layout.empty()) dst_layout = src_layout; + + // Minus one for DNNL representation. No dilation for DNNL is 0, for relay is 1. + for (auto& d : dilates) d--; + + // Take into account provided layout strings + src_tr = src_tr.TreatAs(src_layout); + dst_tr = dst_tr.TreatAs(dst_layout); + wgh_tr = wgh_tr.TreatAs(wgh_layout); + + // Should support G mixed with O. Like { G*O, I, H, W } + // Use { G, O, I, H, W } weight format even if groups == 1 + if (wgh_layout.find("G") == std::string::npos) { + auto w_dims = wgh_tr.dims(); + w_dims[0] /= groups; + w_dims.insert(w_dims.begin(), groups); + wgh_tr = wgh_tr.Reshape(w_dims); } - dnnl::memory::dims weights_dims = weights_dims_; - if (groups > 1) { - weights_dims = {groups, channels / groups, src_dims[1] / groups}; - weights_dims.insert(weights_dims.end(), weights_dims_.begin() + 2, weights_dims_.end()); - if (kernel_layout == "OIHW") { - kernel_layout.insert(0, "G"); - } + // Assumption that bias is correct and can be squeezed to 1D + bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); + + // TODO(@apeskov): This is WA. In case of padded blocked tensor format we do not know original + // shapes. Example tensor {1, 10, 224, 224} with layout "NCNH8c" will lead to tensor + // {1, 2, 224, 224, 8}. Identically as for shapes {1, 11, 224, 224} or {1, 15, 224, 224}. + // + // Let's try to compensate it for weight tensor. Weight IC should match with source IC. + // Example src: [1, 3, 224, 224] with layout NCHW + // wgh: [16, 3, 3, 3] with layout OIHW2i8o -> [2, 2, 3, 3, 2, 8] + if (wgh_tr.dims()[2] != src_tr.dims()[1] / groups) { + auto wgh_croped_dims = wgh_tr.dims(); + wgh_croped_dims[2] = src_tr.dims()[1]; + auto zero_offset = dnnl::memory::dims(wgh_tr.dims().size(), 0); + wgh_tr = wgh_tr.Crop(wgh_croped_dims, zero_offset); } - // Memory descriptions. - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - auto conv_src_md = dnnl::memory::desc(src_dims, dtype, layout2tag(data_layout)); - auto conv_weights_md = dnnl::memory::desc(weights_dims, dtype, layout2tag(kernel_layout)); - auto conv_bias_md = dnnl::memory::desc(bias_dims, dtype, tag::any); - auto conv_dst_md = dnnl::memory::desc(dst_dims, dtype, tag::any); - // Conv description. - auto conv_desc = - has_bias ? 
dnnl::convolution_forward::desc( - dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, - conv_src_md, conv_weights_md, conv_bias_md, conv_dst_md, strides_dims, - dilates_dims, padding_dims_l, padding_dims_r) - : dnnl::convolution_forward::desc(dnnl::prop_kind::forward_inference, - dnnl::algorithm::convolution_direct, conv_src_md, - conv_weights_md, conv_dst_md, strides_dims, - dilates_dims, padding_dims_l, padding_dims_r); + auto conv_desc = dnnl::convolution_forward::desc( + dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, + src_tr.LayoutAny().desc(), wgh_tr.LayoutAny().desc(), bias_tr.LayoutAny().desc(), + dst_tr.LayoutAny().desc(), strides, dilates, padding_l, padding_r); // Enable elementwise post-ops. auto conv_prim_desc = dnnl::convolution_forward::primitive_desc(conv_desc, attr, engine_); - // Push to the network. - auto conv = dnnl::convolution_forward(conv_prim_desc); - net_.push_back(conv); - - // Data memory. - auto conv_src_memory = BindDNNLMemory(data_entry, conv_src_md); + src_tr = src_tr.RequestLayout(conv_prim_desc.src_desc()); + wgh_tr = wgh_tr.RequestLayout(conv_prim_desc.weights_desc()); + dst_tr = dst_tr.RequestLayout(conv_prim_desc.dst_desc()); + bias_tr = bias_tr.RequestLayout(conv_prim_desc.bias_desc()); - // Weight memory. - auto conv_weights_memory = BindDNNLMemory(weight_entry, conv_prim_desc.weights_desc()); + auto scratchpad_tr = TensorRequisite::AsIs(conv_prim_desc.scratchpad_desc()); - // Output memory. - auto conv_dst_memory = BindDNNLMemory(out_entry, conv_prim_desc.dst_desc()); - - // Bias memory. - auto conv_bias_memory = dnnl::memory({bias_dims, dtype, tag::x}, engine_); - if (has_bias) { - auto bias_entry = node.GetInputs()[2]; - BindDNNLMemory(bias_entry, conv_bias_memory); - - // Bind memory buffers. - net_args_.push_back({{DNNL_ARG_SRC, conv_src_memory}, - {DNNL_ARG_WEIGHTS, conv_weights_memory}, - {DNNL_ARG_BIAS, conv_bias_memory}, - {DNNL_ARG_DST, conv_dst_memory}}); - } else { - // Bind memory buffers. - net_args_.push_back({{DNNL_ARG_SRC, conv_src_memory}, - {DNNL_ARG_WEIGHTS, conv_weights_memory}, - {DNNL_ARG_DST, conv_dst_memory}}); - } + Submit(dnnl::convolution_forward(conv_prim_desc), {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}); } void Deconvolution(const size_t& nid) { auto node = nodes_[nid]; auto op_name = node.GetOpName(); dnnl::primitive_attr attr; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. - auto data_entry = node.GetInputs()[0]; - auto weight_entry = node.GetInputs()[1]; - JSONGraphNodeEntry out_entry(nid, 0); - dnnl::memory::dims input_shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - dnnl::memory::dims weight_shape = nodes_[weight_entry.id_].GetOpShape()[weight_entry.index_]; - dnnl::memory::dims out_shape = nodes_[out_entry.id_].GetOpShape()[out_entry.index_]; - dnnl::memory::dim channels = - node.GetAttr>("channels")[0] != "" - ? 
std::stoi(node.GetAttr>("channels")[0]) - : out_shape[1]; - std::vector str_strides = node.GetAttr>("strides"); - std::vector str_dilates = node.GetAttr>("dilation"); - std::vector str_padding = node.GetAttr>("padding"); - std::vector str_padding_l(str_padding.begin(), - str_padding.begin() + str_padding.size() / 2); - std::vector str_padding_r(str_padding.end() - str_padding.size() / 2, - str_padding.end()); - std::vector str_out_padding = - node.GetAttr>("output_padding"); - dnnl::memory::dim groups = std::stoi(node.GetAttr>("groups")[0]); - std::string data_layout = node.GetAttr>("data_layout")[0]; - std::string kernel_layout = node.GetAttr>("kernel_layout")[0]; - - // Memory shapes. - dnnl::memory::dims src_dims = TransDims2Plain(input_shape, data_layout); - dnnl::memory::dims weights_dims_ = TransDims2Plain(weight_shape, kernel_layout); - // legalize shape IOHW with layout OIHW - if (weights_dims_[0] == src_dims[1] && weights_dims_[1] == channels) { - std::swap(weights_dims_[0], weights_dims_[1]); - if (kernel_layout.find("OI") == 0) { - kernel_layout.replace(kernel_layout.find("OI"), 2, "IO"); - } - } - weights_dims_[0] = channels; - weights_dims_[1] = src_dims[1]; - dnnl::memory::dims bias_dims = {channels}; - dnnl::memory::dims strides_dims = TransformStr2Dims(str_strides); - dnnl::memory::dims dilates_dims = TransformStr2Dims(str_dilates, true); - dnnl::memory::dims padding_dims_l = TransformStr2Dims(str_padding_l); - dnnl::memory::dims padding_dims_r = TransformStr2Dims(str_padding_r); - dnnl::memory::dims out_padding = TransformStr2Dims(str_out_padding); - dnnl::memory::dims dst_dims = src_dims; - dst_dims[1] = channels; - for (size_t i = 2; i < src_dims.size(); i++) { - dnnl::memory::dim K = weights_dims_[i]; - dnnl::memory::dim S = strides_dims[i - 2]; - dnnl::memory::dim D = dilates_dims[i - 2]; - dnnl::memory::dim PL = padding_dims_l[i - 2]; - dnnl::memory::dim PR = padding_dims_r[i - 2]; - dnnl::memory::dim OP = out_padding[i - 2]; - dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); - dst_dims[i] = S * (src_dims[i] - 1) + DK - PL - PR + OP; + auto src_tr = GetInput(nid, 0); + auto wgh_tr = GetInput(nid, 1); + auto dst_tr = GetOutput(nid, 0); + auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + + auto strides = GetNodeAttr>(node, "strides"); + auto dilates = GetNodeAttr>(node, "dilation"); + auto padding = GetNodeAttr>(node, "padding"); + std::vector padding_l(padding.begin(), padding.begin() + padding.size() / 2); + std::vector padding_r(padding.begin() + padding.size() / 2, padding.end()); + auto groups = GetNodeAttr(node, "groups"); + auto src_layout = GetNodeAttr(node, "data_layout"); + auto dst_layout = GetNodeAttr(node, "out_layout"); + auto wgh_layout = GetNodeAttr(node, "kernel_layout"); + + // dst_layout == "" means to use data_layout + if (dst_layout.empty()) dst_layout = src_layout; + + // Minus one for DNNL representation. No dilation for DNNL is 0, for relay is 1. + for (auto& d : dilates) d--; + + // TODO(@apeskov): WA. conv3dTranspose uses wrong layout specifier. IO instead of OI. + auto wgh_logic_layout = TensorRequisite::DefaultLogicLayoutFor(wgh_layout); + if (wgh_logic_layout == "OIDHW") wgh_logic_layout = "IODHW"; + if (wgh_logic_layout == "GOIDHW") wgh_logic_layout = "GIODHW"; + + // Take into account provided layout strings + src_tr = src_tr.TreatAs(src_layout); + dst_tr = dst_tr.TreatAs(dst_layout); + wgh_tr = wgh_tr.TreatAs(wgh_layout, wgh_logic_layout); + + // Should support G mixed with O. 
Like { G*O, I, H, W } + if (wgh_layout.find("G") == std::string::npos) { + auto w_dims = wgh_tr.dims(); + w_dims[0] /= groups; + w_dims.insert(w_dims.begin(), groups); + wgh_tr = wgh_tr.Reshape(w_dims); } - dnnl::memory::dims weights_dims = weights_dims_; - if (groups > 1) { - weights_dims = {groups, channels / groups, src_dims[1] / groups}; - weights_dims.insert(weights_dims.end(), weights_dims_.begin() + 2, weights_dims_.end()); - } + // Assumption that bias is correct and can be squeezed to 1D + bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); - // Memory descriptions. - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - auto deconv_src_md = dnnl::memory::desc(src_dims, dtype, layout2tag(data_layout)); - auto deconv_weights_md = dnnl::memory::desc(weights_dims, dtype, layout2tag(kernel_layout)); - auto deconv_bias_md = dnnl::memory::desc(bias_dims, dtype, tag::x); - auto deconv_dst_md = dnnl::memory::desc(dst_dims, dtype, tag::any); - - // Transposed covn2d description. - auto deconv_desc = - has_bias ? dnnl::deconvolution_forward::desc( - dnnl::prop_kind::forward_inference, dnnl::algorithm::deconvolution_direct, - deconv_src_md, deconv_weights_md, deconv_bias_md, deconv_dst_md, - strides_dims, dilates_dims, padding_dims_l, padding_dims_r) - : dnnl::deconvolution_forward::desc( - dnnl::prop_kind::forward_inference, dnnl::algorithm::deconvolution_direct, - deconv_src_md, deconv_weights_md, deconv_dst_md, strides_dims, dilates_dims, - padding_dims_l, padding_dims_r); + // Conv description. + auto deconv_desc = dnnl::deconvolution_forward::desc( + dnnl::prop_kind::forward_inference, dnnl::algorithm::deconvolution_direct, + src_tr.LayoutAny().desc(), wgh_tr.LayoutAny().desc(), bias_tr.LayoutAny().desc(), + dst_tr.LayoutAny().desc(), strides, dilates, padding_l, padding_r); // Enable elementwise post-ops. auto deconv_prim_desc = dnnl::deconvolution_forward::primitive_desc(deconv_desc, attr, engine_); - // Push to the network. - auto deconv = dnnl::deconvolution_forward(deconv_prim_desc); - net_.push_back(deconv); - - // Data memory. - auto deconv_src_memory = BindDNNLMemory(data_entry, deconv_src_md); - - // Weight memory. - auto deconv_weights_memory = BindDNNLMemory(weight_entry, deconv_prim_desc.weights_desc()); - - // Output memory. - auto deconv_dst_memory = BindDNNLMemory(out_entry, deconv_prim_desc.dst_desc()); + src_tr = src_tr.RequestLayout(deconv_prim_desc.src_desc()); + wgh_tr = wgh_tr.RequestLayout(deconv_prim_desc.weights_desc()); + dst_tr = dst_tr.RequestLayout(deconv_prim_desc.dst_desc()); + bias_tr = bias_tr.RequestLayout(deconv_prim_desc.bias_desc()); - // Bias memory. - auto deconv_bias_memory = dnnl::memory({bias_dims, dtype, tag::x}, engine_); - if (has_bias) { - auto bias_entry = node.GetInputs()[2]; - BindDNNLMemory(bias_entry, deconv_bias_memory); + auto scratchpad_tr = TensorRequisite::AsIs(deconv_prim_desc.scratchpad_desc()); - // Bind memory buffers. - net_args_.push_back({{DNNL_ARG_SRC, deconv_src_memory}, - {DNNL_ARG_WEIGHTS, deconv_weights_memory}, - {DNNL_ARG_BIAS, deconv_bias_memory}, - {DNNL_ARG_DST, deconv_dst_memory}}); - } else { - // Bind memory buffers. 
- net_args_.push_back({{DNNL_ARG_SRC, deconv_src_memory}, - {DNNL_ARG_WEIGHTS, deconv_weights_memory}, - {DNNL_ARG_DST, deconv_dst_memory}}); - } + Submit(dnnl::deconvolution_forward(deconv_prim_desc), {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}); } void Dense(const size_t& nid) { auto node = nodes_[nid]; auto op_name = node.GetOpName(); dnnl::primitive_attr attr; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. - auto data_entry = node.GetInputs()[0]; - auto weight_entry = node.GetInputs()[1]; - JSONGraphNodeEntry out_entry(nid, 0); - dnnl::memory::dims input_shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - dnnl::memory::dims weight_shape = nodes_[weight_entry.id_].GetOpShape()[weight_entry.index_]; - dnnl::memory::dims out_shape = nodes_[out_entry.id_].GetOpShape()[out_entry.index_]; - dnnl::memory::dim OC = out_shape[1]; - - // Memory shapes. - dnnl::memory::dims data_dims = input_shape; - dnnl::memory::dims weight_dims = weight_shape; - dnnl::memory::dims bias_dims = {OC}; - dnnl::memory::dims out_dims = out_shape; - - // Memory descriptions. - auto dl_dtype = nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]; - auto dtype = dtype_dl2dnnl(dl_dtype); - auto data_md = dnnl::memory::desc({data_dims, dtype, tag::nc}); - auto weight_md = dnnl::memory::desc({weight_dims, dtype, tag::nc}); - auto bias_md = dnnl::memory::desc({bias_dims, dtype, tag::x}); - auto dst_md = dnnl::memory::desc({out_dims, dtype, tag::nc}); + auto src_tr = GetInput(nid, 0); + auto wgh_tr = GetInput(nid, 1); + auto dst_tr = GetOutput(nid, 0); + auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + + // Assumption that bias is correct and can be squeezed to 1D + bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); // Dense description. - auto dense_desc = dnnl::inner_product_forward::desc(dnnl::prop_kind::forward_inference, data_md, - weight_md, bias_md, dst_md); + auto dense_desc = dnnl::inner_product_forward::desc( + dnnl::prop_kind::forward_inference, src_tr.LayoutAny().desc(), wgh_tr.LayoutAny().desc(), + bias_tr.LayoutAny().desc(), dst_tr.LayoutAny().desc()); // Enable elementwise post-ops. auto dense_prim_desc = dnnl::inner_product_forward::primitive_desc(dense_desc, attr, engine_); - auto dense = dnnl::inner_product_forward(dense_prim_desc); - net_.push_back(dense); + src_tr = src_tr.RequestLayout(dense_prim_desc.src_desc()); + wgh_tr = wgh_tr.RequestLayout(dense_prim_desc.weights_desc()); + dst_tr = dst_tr.RequestLayout(dense_prim_desc.dst_desc()); + bias_tr = bias_tr.RequestLayout(dense_prim_desc.bias_desc()); - // Memories. - auto data_memory = BindDNNLMemory(data_entry, data_md); - auto weight_memory = BindDNNLMemory(weight_entry, weight_md); + auto scratchpad_tr = TensorRequisite::AsIs(dense_prim_desc.scratchpad_desc()); - // Bias memory. - auto bias_memory = dnnl::memory(bias_md, engine_); - if (has_bias) { - auto bias_entry = node.GetInputs()[2]; - BindDNNLMemory(bias_entry, bias_memory); - } else { - float bias[OC] = {0}; - write_to_dnnl_memory(bias, bias_memory, OC * ((dl_dtype.bits + 7) / 8)); - } - - // Output memory. 
- auto dst_memory = BindDNNLMemory(out_entry, dense_prim_desc.dst_desc()); - - net_args_.push_back({{DNNL_ARG_SRC, data_memory}, - {DNNL_ARG_WEIGHTS, weight_memory}, - {DNNL_ARG_BIAS, bias_memory}, - {DNNL_ARG_DST, dst_memory}}); + Submit(dnnl::inner_product_forward(dense_prim_desc), {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}); } void BatchNorm(const size_t& nid) { auto node = nodes_[nid]; - auto data_entry = node.GetInputs()[0]; - auto gamma_entry = node.GetInputs()[1]; - auto beta_entry = node.GetInputs()[2]; - auto mean_entry = node.GetInputs()[3]; - auto variance_entry = node.GetInputs()[4]; - dnnl::memory::dims data_shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - dnnl::memory::dim IC = data_shape[1]; - float epsilon = std::stof(node.GetAttr>("epsilon")[0]); + auto src_tr = GetInput(nid, 0); + auto gamma_tr = GetInput(nid, 1); + auto beta_tr = GetInput(nid, 2); + auto mean_tr = GetInput(nid, 3); + auto var_tr = GetInput(nid, 4); + auto dst_tr = GetOutput(nid, 0); - // Memory description. - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - dnnl::memory::desc data_md = GenDNNLMemDescByShape(data_shape, dtype); + auto axis = GetNodeAttr(node, "axis"); + auto epsilon = GetNodeAttr(node, "epsilon"); + auto center = GetNodeAttr(node, "center"); + auto scale = GetNodeAttr(node, "scale"); + + ICHECK(axis == 1 && center && scale) << "Unimplemented BatchNorm case"; - // BN description. auto bn_desc = dnnl::batch_normalization_forward::desc( - dnnl::prop_kind::forward_inference, data_md, epsilon, + dnnl::prop_kind::forward_inference, src_tr.desc(), epsilon, dnnl::normalization_flags::use_global_stats | dnnl::normalization_flags::use_scale_shift); auto bn_prim_desc = dnnl::batch_normalization_forward::primitive_desc(bn_desc, engine_); - auto bn = dnnl::batch_normalization_forward(bn_prim_desc); - net_.push_back(bn); - - // Memories. - auto data_memory = BindDNNLMemory(data_entry, data_md); - JSONGraphNodeEntry out_entry(nid, 0); - auto out_memory = BindDNNLMemory(out_entry, data_md); - auto mean_memory = BindDNNLMemory(mean_entry, bn_prim_desc.mean_desc()); - auto variance_memory = BindDNNLMemory(variance_entry, bn_prim_desc.variance_desc()); - - // In DNNL, weight is composed of gamma+beta, so we point them to the same DNNL memory but - // assign an offset to beta data for runtime serialization. 
- auto weight_memory = BindDNNLMemory(gamma_entry, bn_prim_desc.weights_desc(), 0); - BindDNNLMemory(beta_entry, weight_memory, IC); - - net_args_.push_back({{DNNL_ARG_SRC, data_memory}, - {DNNL_ARG_DST, out_memory}, - {DNNL_ARG_SCALE_SHIFT, weight_memory}, - {DNNL_ARG_MEAN, mean_memory}, - {DNNL_ARG_VARIANCE, variance_memory}}); + + // Concatenate scale and shift tensors + auto scale_shift_tr = TensorRequisite::AsIs(bn_prim_desc.weights_desc(), GenUniqueEid()); + auto sc_sh_dims = scale_shift_tr.dims(); + ICHECK(sc_sh_dims.size() == 2); + ICHECK(sc_sh_dims[0] == 2); + sc_sh_dims[0] /= 2; + auto scale_tr = scale_shift_tr.Crop(sc_sh_dims, {0, 0}).Squeeze(); + auto shift_tr = scale_shift_tr.Crop(sc_sh_dims, {1, 0}).Squeeze(); + + auto register_copy = [this](const TensorRequisite& src, const TensorRequisite& dst) { + dnnl::reorder::primitive_desc copy_pd(engine_, src.desc(), engine_, dst.desc()); + Submit(dnnl::reorder(copy_pd), {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}}); + }; + + register_copy(gamma_tr, scale_tr); + register_copy(beta_tr, shift_tr); + + Submit(dnnl::batch_normalization_forward(bn_prim_desc), {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_DST, dst_tr}, + {DNNL_ARG_SCALE_SHIFT, scale_shift_tr}, + {DNNL_ARG_MEAN, mean_tr}, + {DNNL_ARG_VARIANCE, var_tr}}); } void Pooling(const size_t& nid, dnnl::algorithm algo) { auto node = nodes_[nid]; + auto src_tr = GetInput(nid, 0); + auto dst_tr = GetOutput(nid, 0); + // Setup attributes. - auto data_entry = node.GetInputs()[0]; - JSONGraphNodeEntry out_entry(nid, 0); - dnnl::memory::dims input_shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - dnnl::memory::dims out_shape = nodes_[out_entry.id_].GetOpShape()[out_entry.index_]; - std::vector str_kernel = node.GetAttr>("pool_size"); - std::vector str_strides = node.GetAttr>("strides"); - std::vector str_padding = node.GetAttr>("padding"); - std::vector str_padding_l(str_padding.begin(), - str_padding.begin() + str_padding.size() / 2); - std::vector str_padding_r(str_padding.end() - str_padding.size() / 2, - str_padding.end()); - std::vector str_dilates = node.GetAttr>("dilation"); - std::string layout = node.GetAttr>("layout")[0]; + auto strides = GetNodeAttr>(node, "strides"); + auto dilates = GetNodeAttr>(node, "dilation"); + auto padding = GetNodeAttr>(node, "padding"); + std::vector padding_l(padding.begin(), padding.begin() + padding.size() / 2); + std::vector padding_r(padding.begin() + padding.size() / 2, padding.end()); + auto kernel = GetNodeAttr>(node, "pool_size"); + auto src_layout = GetNodeAttr(node, "layout"); + auto dst_layout = GetNodeAttr(node, "out_layout"); + + // dst_layout == "" means to use data_layout + if (dst_layout.empty()) dst_layout = src_layout; + + // Minus one for DNNL representation. No dilation for DNNL is 0, for relay is 1. + for (auto& d : dilates) d--; + + // Take into account provided layout strings + src_tr = src_tr.TreatAs(src_layout); + dst_tr = dst_tr.TreatAs(dst_layout); // Attributes related to AvgPool if (algo == dnnl::algorithm::pooling_avg) { - int int_countpad = std::stoi(node.GetAttr>("count_include_pad")[0]); - bool count_include_pad = int_countpad != 0 ? true : false; - algo = count_include_pad ? dnnl::algorithm::pooling_avg_include_padding - : dnnl::algorithm::pooling_avg_exclude_padding; + auto include_pad = GetNodeAttr(node, "count_include_pad"); + algo = include_pad ? 
dnnl::algorithm::pooling_avg_include_padding + : dnnl::algorithm::pooling_avg_exclude_padding; } - dnnl::memory::dims src_dims = TransDims2Plain(input_shape, layout); - dnnl::memory::dims dst_dims = TransDims2Plain(out_shape, layout); - dnnl::memory::dims kernel_dims = TransformStr2Dims(str_kernel); - dnnl::memory::dims strides_dims = TransformStr2Dims(str_strides); - dnnl::memory::dims dilates_dims = TransformStr2Dims(str_dilates, true); - dnnl::memory::dims padding_dims_l = TransformStr2Dims(str_padding_l); - dnnl::memory::dims padding_dims_r = TransformStr2Dims(str_padding_r); - - // Memory descriptions. - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - auto pool_src_md = dnnl::memory::desc(src_dims, dtype, layout2tag(layout)); - auto pool_dst_md = dnnl::memory::desc(dst_dims, dtype, tag::any); - // Pooling description. - auto pool_desc = dnnl::pooling_forward::desc(dnnl::prop_kind::forward_inference, algo, - pool_src_md, pool_dst_md, strides_dims, - kernel_dims, padding_dims_l, padding_dims_r); - - auto pool_prim_desc = dnnl::pooling_forward::primitive_desc(pool_desc, engine_, true); - auto pool = dnnl::pooling_forward(pool_prim_desc); - net_.push_back(pool); + auto pool_desc = dnnl::pooling_v2_forward::desc( + dnnl::prop_kind::forward_inference, algo, src_tr.desc(), //<= Do not use any for src tensor + dst_tr.LayoutAny().desc(), strides, kernel, dilates, padding_l, padding_r); + auto pool_prim_desc = dnnl::pooling_v2_forward::primitive_desc(pool_desc, engine_); - // Memories. - auto pool2d_src_memory = BindDNNLMemory(data_entry, pool_src_md); + src_tr = src_tr.RequestLayout(pool_prim_desc.src_desc()); + dst_tr = dst_tr.RequestLayout(pool_prim_desc.dst_desc()); - auto pool2d_dst_memory = BindDNNLMemory(out_entry, pool_prim_desc.dst_desc()); + auto scratchpad_tr = TensorRequisite::AsIs(pool_prim_desc.scratchpad_desc()); - // Bind memory buffers. 
- net_args_.push_back({{DNNL_ARG_SRC, pool2d_src_memory}, {DNNL_ARG_DST, pool2d_dst_memory}}); + Submit(dnnl::pooling_v2_forward(pool_prim_desc), + {{DNNL_ARG_SRC, src_tr}, {DNNL_ARG_DST, dst_tr}, {DNNL_ARG_SCRATCHPAD, scratchpad_tr}}); } void Eltwise(const size_t& nid) { auto node = nodes_[nid]; auto op_name = node.GetOpName(); - auto algo = elt_name2algo[op_name]; + auto algo = elt_name2algo.at(op_name); + + auto src_tr = GetInput(nid, 0); + auto dst_tr = GetOutput(nid, 0); - auto data_entry = node.GetInputs()[0]; - dnnl::memory::dims shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - dnnl::memory::desc data_md = GenDNNLMemDescByShape(shape, dtype); float alpha = 0., beta = 0.; if (op_name == "clip") { - alpha = std::stof(node.GetAttr>("a_min")[0]); - beta = std::stof(node.GetAttr>("a_max")[0]); + alpha = GetNodeAttr(node, "a_min"); + beta = GetNodeAttr(node, "a_max"); } else if (op_name == "nn.leaky_relu") { - alpha = std::stof(node.GetAttr>("alpha")[0]); + alpha = GetNodeAttr(node, "alpha"); } - auto elt_desc = - dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, data_md, alpha, beta); + auto elt_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, + src_tr.desc(), alpha, beta); auto elt_prim_desc = dnnl::eltwise_forward::primitive_desc(elt_desc, engine_); - ICHECK(data_md == elt_prim_desc.dst_desc()); - - auto elt = dnnl::eltwise_forward(elt_prim_desc); - net_.push_back(elt); + ICHECK(src_tr.desc() == elt_prim_desc.dst_desc()); - auto data_memory = BindDNNLMemory(data_entry, data_md); - JSONGraphNodeEntry out_entry(nid, 0); - auto out_memory = BindDNNLMemory(out_entry, data_md); - - net_args_.push_back({{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, out_memory}}); + Submit(dnnl::eltwise_forward(elt_prim_desc), {{DNNL_ARG_SRC, src_tr}, {DNNL_ARG_DST, dst_tr}}); } void Softmax(const size_t& nid) { auto node = nodes_[nid]; - auto data_entry = node.GetInputs()[0]; - dnnl::memory::dims shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - int axis = std::stoi(node.GetAttr>("axis")[0]); + auto src_tr = GetInput(nid, 0); + auto dst_tr = GetOutput(nid, 0); + + auto axis = GetNodeAttr(node, "axis"); if (axis < 0) { - axis = shape.size() + axis; + axis = src_tr.dims().size() + axis; } - auto dtype = dtype_dl2dnnl(nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]); - dnnl::memory::desc data_md = GenDNNLMemDescByShape(shape, dtype); auto softmax_desc = - dnnl::softmax_forward::desc(dnnl::prop_kind::forward_inference, data_md, axis); + dnnl::softmax_forward::desc(dnnl::prop_kind::forward_inference, src_tr.desc(), axis); auto softmax_prim_desc = dnnl::softmax_forward::primitive_desc(softmax_desc, engine_); - ICHECK(data_md == softmax_prim_desc.dst_desc()); - - auto softmax = dnnl::softmax_forward(softmax_prim_desc); - net_.push_back(softmax); + ICHECK(dst_tr.desc() == softmax_prim_desc.dst_desc()); - auto data_memory = BindDNNLMemory(data_entry, data_md); - JSONGraphNodeEntry out_entry(nid, 0); - auto out_memory = BindDNNLMemory(out_entry, data_md); - - net_args_.push_back({{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, out_memory}}); + Submit(dnnl::softmax_forward(softmax_prim_desc), + {{DNNL_ARG_SRC, src_tr}, {DNNL_ARG_DST, dst_tr}}); } void Binary(const size_t& nid, dnnl::algorithm algo) { auto node = nodes_[nid]; + ICHECK_EQ(node.GetInputs().size(), 2U); // Memory and compute description. 
- std::vector data_dims; - std::vector data_mds; - std::vector data_memories; + auto lhs_tr = GetInput(nid, 0); + auto rhs_tr = GetInput(nid, 1); + auto dst_tr = GetOutput(nid, 0); - ICHECK_EQ(node.GetInputs().size(), 2U); - for (auto entry : node.GetInputs()) { - auto data_shape = nodes_[entry.id_].GetOpShape()[entry.index_]; - auto dtype = dtype_dl2dnnl(nodes_[entry.id_].GetOpDataType()[entry.index_]); - dnnl::memory::desc data_md = GenDNNLMemDescByShape(data_shape, dtype); - - data_dims.push_back(data_shape); - data_mds.push_back(data_md); - data_memories.push_back(BindDNNLMemory(entry, data_md)); - } - ICHECK(data_dims[0] == data_dims[1]); - auto out_md = data_mds[0]; - JSONGraphNodeEntry out_entry(nid, 0); - auto out_memory = BindDNNLMemory(out_entry, out_md); + lhs_tr = lhs_tr.Broadcast(dst_tr.dims()); + rhs_tr = rhs_tr.Broadcast(dst_tr.dims()); - auto binary_desc = dnnl::binary::desc(algo, data_mds[0], data_mds[1], out_md); + auto binary_desc = dnnl::binary::desc(algo, lhs_tr.desc(), rhs_tr.desc(), dst_tr.desc()); auto binary_prim_desc = dnnl::binary::primitive_desc(binary_desc, engine_); - auto binary = dnnl::binary(binary_prim_desc); - net_.push_back(binary); - net_args_.push_back({{DNNL_ARG_SRC_0, data_memories[0]}, - {DNNL_ARG_SRC_1, data_memories[1]}, - {DNNL_ARG_DST, out_memory}}); + Submit(dnnl::binary(binary_prim_desc), + {{DNNL_ARG_SRC_0, lhs_tr}, {DNNL_ARG_SRC_1, rhs_tr}, {DNNL_ARG_DST, dst_tr}}); + } + + template ::value, int> = 0> + T AttrConvert(std::vector val) { + ICHECK_EQ(val.size(), 1); + return std::stol(val[0]); + } + + template ::value, int> = 0> + T AttrConvert(std::vector val) { + ICHECK_EQ(val.size(), 1); + return std::stof(val[0]); + } + + template ::value, int> = 0> + T AttrConvert(std::vector val) { + ICHECK_EQ(val.size(), 1); + return val[0]; + } + + template >::value, int> = 0> + T AttrConvert(std::vector val) { + T res; + for (const auto& el : val) res.push_back(AttrConvert({el})); + return res; + } + + /*! + * \brief Helper to extract node attribute with ability to specify default value and result type. + */ + template + const T GetNodeAttr(const json::JSONGraphNode& node, std::string name, + std::vector def = {}) { + auto attr = node.HasAttr(name) ? node.GetAttr>(name) : def; + return AttrConvert(attr); } - // Read from DNNL memory (+offset) and write to the handle. - inline void read_from_dnnl_memory(void* handle, const dnnl::memory& mem, size_t size, - size_t offset = 0) { - uint8_t* src = static_cast(mem.get_data_handle()); - std::copy(src + offset, src + offset + size, static_cast(handle)); + TensorRequisite GetInput(const size_t& nid, const int idx) { + if (idx == -1) return {}; // -1 reserved value for empty input. + + const JSONGraphNode& node = nodes_[nid]; + + ICHECK_LT(idx, node.GetInputs().size()); + auto data_entry = node.GetInputs()[idx]; + + auto shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; + auto dtype = nodes_[data_entry.id_].GetOpDataType()[data_entry.index_]; + auto eid = node_row_ptr_[data_entry.id_] + data_entry.index_; + auto const_dl_tensor = data_entry_[eid]; + + auto desc = MakePlainDesc(shape, dtype); + + TensorRequisite res; + if (const_dl_tensor) { + ICHECK(const_dl_tensor->data); + ICHECK(const_dl_tensor->strides == nullptr); + auto mem = dnnl::memory(desc, engine_, const_dl_tensor->data); + res = TensorRequisite::AsIs(mem, eid); + } else { + res = TensorRequisite::AsIs(desc, eid); + } + return res; } - // Read from the handle and write to DNNL memory (+offset). 
- inline void write_to_dnnl_memory(void* handle, const dnnl::memory& mem, size_t size, - size_t offset = 0) { - uint8_t* dst = static_cast(mem.get_data_handle()); - std::copy(reinterpret_cast(handle), reinterpret_cast(handle) + size, - dst + offset); + TensorRequisite GetOutput(const size_t& nid, const int idx) { + if (idx == -1) return {}; // -1 reserved value for empty input. + + const JSONGraphNode& node = nodes_[nid]; + + ICHECK_LT(idx, node.GetNumOutput()); + auto shape = node.GetOpShape()[idx]; + auto dtype = node.GetOpDataType()[idx]; + auto eid = node_row_ptr_[nid] + static_cast(idx); + + ICHECK(data_entry_[eid] == nullptr); + auto desc = MakePlainDesc(shape, dtype); + + return TensorRequisite::AsIs(desc, eid).Backward(); } - // Generate DNNL memory description and infer the data layout by the given shape. - inline dnnl::memory::desc GenDNNLMemDescByShape(const dnnl::memory::dims& shape, dt dtype) { - dnnl::memory::desc data_md; - switch (shape.size()) { - case 2: - data_md = dnnl::memory::desc({shape, dtype, tag::ab}); - break; - case 3: - data_md = dnnl::memory::desc({shape, dtype, tag::abc}); - break; - case 4: - data_md = dnnl::memory::desc({shape, dtype, tag::abcd}); - break; - case 5: - data_md = dnnl::memory::desc({shape, dtype, tag::abcde}); - break; - default: - LOG(FATAL) << "Unsupported data shape dimension: " << shape.size(); - break; + /*! \brief Helper function to register primitive into execution queue */ + void Submit(const dnnl::primitive& prim, + const std::unordered_map& tr_args) { + // Register all provided TR arguments + std::unordered_map prim_arg_id; + TensorRegistry::ActionQue post_prim_actions; + for (const auto& kvp : tr_args) { + const auto& key = kvp.first; + const auto& tr = kvp.second; + + if (!tr.defined()) continue; // empty arg is admitted. Just skip it + auto arg_id = tensor_registry_.Register(tr, tr.IsReversed() ? &post_prim_actions : &net_); + prim_arg_id[key] = arg_id; } - return data_md; + + // Register main primitive + net_.push_back({prim, prim_arg_id}); + + // Register post actions + net_.insert(net_.end(), post_prim_actions.begin(), post_prim_actions.end()); } + uint32_t GenUniqueEid() { return next_unique_eid_offset_++; } + /* The dnnl engine. */ dnnl::engine engine_; /* The dnnl stream. */ dnnl::stream stream_; /* The network layers that are represented in dnnl primitives. */ - std::vector net_; - /* The memory that is consumed by arguments. */ - std::vector> net_args_; - /* The entry ID to its corresponding output memory. */ - std::unordered_map> entry_out_mem_; + TensorRegistry::ActionQue net_; + /* Storage for all memory objects */ + TensorRegistry tensor_registry_; + /* Generator of new unique eid which doesn't match with existing data entry */ + uint32_t next_unique_eid_offset_; + /* Map of Run arg idx to corresponding eid */ + std::vector run_arg_eid_; }; runtime::Module DNNLJSONRuntimeCreate(String symbol_name, String graph_json, diff --git a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h new file mode 100644 index 0000000000000..d02ceff5de823 --- /dev/null +++ b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h @@ -0,0 +1,720 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/dnnl/dnnl_tensor_requisite.cc + * \brief Helper TR wrapper to simplify tensors processing + */ + +#ifndef TVM_RUNTIME_CONTRIB_DNNL_DNNL_TENSOR_REQUISITE_H_ +#define TVM_RUNTIME_CONTRIB_DNNL_DNNL_TENSOR_REQUISITE_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// TODO(@apeskov): Have to mute warning from dnnl headers. +// -Wzero-as-null-pointer-constant and -Wdocumentation-unknown-command +#include + +#include "dnnl_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace utils; + +/*! + * \brief Helper object to simplify tensor transformation description. + * + * Allow to specify original source tensor and future actions which should be applied to it. + * Can be treated as sequence of reordering or reinterpretation of original source tensor. + * Finally TR can be solved as proper interpretation of source memory buffer, or sequence of + * dnnl::reorder operators which will provide desired data. + * + * \note Empty TR object allow any manipulation. Empty TR will be returned. + * + * \sa TensorRegistry + * + * Example: + * \code + * dnnl::memory src_mem = ...; // 5D tensor, shape {5, 2, 128, 128, 8} + * + * // Construct TR + * auto tr = TensorRequisite.AsIs(src_mem, eid); // 5D + * + * // describe sequence of layout transformation + * tr = tr.TreatAs("ABCD8b"); // 4D + * tr = tr.Permute({0, 2, 3, 1}); // Permute axes NCHW -> NHWC + * tr = tr.Crop({1, 128, 128, 16}, {0, 0, 0}); // extract first batch element + * tr = tr.Squeeze(); // 1D + * + * // register TR + * TensorRegistry t_reg; + * auto t_id = t_reg.register(tr); + * + * // Get final dnnl::memory object + * auto solver = t_reg.MakeSolver(ext_tensor_provider); + * auto mem = solver(t_id); + * \endcode + * + */ +class TensorRequisite { + public: + using Tid = uint32_t; + static constexpr Tid kUndefinedTid = std::numeric_limits::max() - 1; + + /*! \brief Empty constructor */ + TensorRequisite() {} + + /*! \brief Construct TR on top of existing memory object */ + static TensorRequisite AsIs(const dnnl::memory& mem, Tid id = kUndefinedTid) { + auto res = AsIs(mem.get_desc(), id); + if (mem.get_data_handle() != nullptr) res.mem_ = mem; + return res; + } + + /*! \brief Construct TR on top of existing memory descriptor object */ + static TensorRequisite AsIs(const dnnl::memory::desc& desc, Tid id = kUndefinedTid) { + return {desc, {}, false, {}, id, false}; + } + + /*! \brief return logical shape of tensor */ + dnnl::memory::dims dims() const { return t_desc_.dims(); } + + /*! \brief return data type of tensor */ + dnnl::memory::data_type data_type() const { return t_desc_.data_type(); } + + /*! \brief return tensor desc */ + dnnl::memory::desc desc() const { return t_desc_; } + + /*! 
\brief Make TR with backward dataflow */ + TensorRequisite Backward() const { + if (!defined()) return *this; + ICHECK(orig_ == nullptr); + return {t_desc_, orig_, reinterpret_, mem_, eid_, true}; + } + + /*! \brief Produce TR with permuted axes */ + TensorRequisite Permute(const std::vector& permutation) const { + if (!defined()) return *this; // nothing for empty TR + + auto orig = std::make_shared(*this); + // reinterpret memory buffer with new strides + auto desc = t_desc_.permute_axes(permutation); + return {desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Produce TR with reinterpret data of original tr */ + TensorRequisite Reshape(const dnnl::memory::dims& shape) const { + if (!defined()) return *this; // nothing for empty TR + if (t_desc_.dims() == shape) return *this; + + auto orig = std::make_shared(*this); + // reinterpret memory buffer with new strides + auto desc = t_desc_.reshape(shape); + return {desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Produce TR with broadcasted values */ + TensorRequisite Broadcast(const dnnl::memory::dims& shape) const { + if (!defined()) return *this; // nothing for empty TR + if (t_desc_.dims() == shape) return *this; + ICHECK(!reverse_data_flow_); + + auto orig = std::make_shared(*this); + + // numpy like broadcast + auto extended_dims = t_desc_.dims(); + auto one_filled = dnnl::memory::dims(shape.size() - extended_dims.size(), 1); + extended_dims.insert(extended_dims.begin(), one_filled.begin(), one_filled.end()); + auto desc = t_desc_.reshape(extended_dims); + for (size_t i = 0; i < extended_dims.size(); i++) { + if (extended_dims[i] == shape[i]) continue; + ICHECK(extended_dims[i] == 1); + ICHECK(desc.data.dims[i] == desc.data.padded_dims[i]); + + desc.data.dims[i] = shape[i]; + desc.data.padded_dims[i] = shape[i]; + desc.data.format_desc.blocking.strides[i] = 0; + } + + // reinterpret memory buffer with new strides + return {desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Produce TR with sub memory view (ROI) */ + TensorRequisite Crop(const dnnl::memory::dims& shape, const dnnl::memory::dims& offset) const { + if (!defined()) return *this; // nothing for empty TR + + ICHECK_EQ(shape.size(), t_desc_.dims().size()); + ICHECK_EQ(offset.size(), t_desc_.dims().size()); + + auto orig = std::make_shared(*this); + // reinterpret memory buffer with new strides + auto desc = t_desc_.submemory_desc(shape, offset, /*allow_empty=*/true); + + // Originally DNNL implementation is very limited. Let's slightly enhance it. + if (!desc && t_desc_.data.format_kind == dnnl_blocked) { + bool offset_is_zero = + std::all_of(offset.begin(), offset.end(), [](auto el) { return el == 0; }); + + dnnl::memory::dims block_sizes(t_desc_.dims().size(), 1); + for (int i = 0; i < t_desc_.data.format_desc.blocking.inner_nblks; i++) + block_sizes[t_desc_.data.format_desc.blocking.inner_idxs[i]] *= + t_desc_.data.format_desc.blocking.inner_blks[i]; + + bool shape_reduction_less_than_block = true; + for (int i = 0; i < t_desc_.data.ndims; i++) { + shape_reduction_less_than_block &= t_desc_.data.dims[i] - shape[i] < block_sizes[i]; + } + + // This is auto padded case. Just update dims value. + if (offset_is_zero && shape_reduction_less_than_block) { + desc = t_desc_; + std::copy(shape.begin(), shape.end(), desc.data.dims); + } + } + + ICHECK(desc); + + return {desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! 
\brief Produce TR with squeeze shape */ + TensorRequisite Squeeze(const dnnl::memory::dims& dims_to_squeeze = {}) const { + if (!defined()) return *this; // nothing for empty TR + + dnnl::memory::dims squeezed_dims; + if (dims_to_squeeze.empty()) { + for (auto d : t_desc_.dims()) + if (d != 1) squeezed_dims.push_back(d); + } else { + for (size_t i = 0; i < t_desc_.dims().size(); i++) + if (std::find(dims_to_squeeze.begin(), dims_to_squeeze.end(), i) == dims_to_squeeze.end()) + squeezed_dims.push_back(t_desc_.dims()[i]); + } + + if (squeezed_dims.empty()) squeezed_dims = {1}; + + auto orig = std::make_shared(*this); + // reinterpret memory buffer with new strides + auto desc = t_desc_.reshape(squeezed_dims); + return {desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Produce TR with specified layout descriptor */ + TensorRequisite RequestLayout(dnnl::memory::desc desc) const { + if (!defined()) return *this; // nothing for empty TR + + // If it's the same desc just return self + if (desc == t_desc_) return *this; + + ICHECK(t_desc_.dims() == desc.dims()) << "Requested layout is not compatible with " + "presented shape"; + + auto orig = std::make_shared(*this); + return {desc, orig, false, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Define which logical dims ordering is default for particular layout string. */ + static std::string DefaultLogicLayoutFor(const std::string& layout) { + // Rank is all non digit marked dims + auto it = layout.begin(); + while (it != layout.end() && !std::isdigit(*it)) it++; + int rank = std::distance(layout.begin(), it); + + static const std::vector sparse_dims = {"W", "HW", "DHW"}; + if (layout.find("N") != std::string::npos) return "NC" + sparse_dims[rank - 3]; + if (layout.find("G") != std::string::npos) return "GOI" + sparse_dims[rank - 4]; + if (layout.find("O") != std::string::npos) return "OI" + sparse_dims[rank - 3]; + + LOG(FATAL) << "Unknown layout " << layout << "There is no default scheme to handle it"; + return {}; + } + + /*! + * \brief Treat TR shape as described in layout string. + * + * Blocked dimensions will be concatenated and put into proper shape position corresponding to . + * resulting_layout_logic argument. If desired logic layout was not provided it will be deduced + * automatically based on some internal heuristics. + * + * Limitation 1. Blocking dims should be dense. Dims marked with digits use natural strides. + * Limitation 2. Blocking dims are innermost. Dims marked like 8c, 4o goes after regular + * dimensions. NC8cHW4h4cD is not valid tensor in terms of DNNL. And cannot be + * achieved with memory reinterpretation, so data copy is required. Proper layout + * looks like NCHWD_8c4h4c, first part is outer dims, second digits marked part is + * innermost. + */ + TensorRequisite TreatAs(const std::string& layout, std::string desired_logic_layout = "") const { + if (desired_logic_layout.empty()) desired_logic_layout = DefaultLogicLayoutFor(layout); + + const auto origin_dims = dims(); + + // split layout string to tokens {size, tag} like {16, 'C'}, {4, 'O'} + std::vector> layout_tokens; + for (auto it = layout.begin(); it != layout.end();) { + auto start = it; + while (std::isdigit(*it)) it++; + int blk_size = start == it ? 
-1 : std::stoi(std::string{start, it}); + layout_tokens.push_back({blk_size, std::toupper(*it)}); + it++; + } + + // check applicability of layout + auto it = layout_tokens.begin(); + while (it != layout_tokens.end() && it->first == -1) it++; + int rank = std::distance(layout_tokens.begin(), it); + while (it != layout_tokens.end()) { + ICHECK_NE(it->first, -1) << "DNNL limitation. Blocking dims should be innermost. " + << "But received layout is " << layout; + it++; + } + + ICHECK_EQ(layout_tokens.size(), origin_dims.size()); + ICHECK_EQ(rank, desired_logic_layout.size()) << layout; + + std::vector> outermost_tokens(layout_tokens.begin(), + layout_tokens.begin() + rank); + std::vector> innermost_tokens(layout_tokens.begin() + rank, + layout_tokens.end()); + // define dim resulting dim positions + std::map dim_position_by_tag; + for (size_t i = 0; i < desired_logic_layout.size(); i++) + dim_position_by_tag[std::toupper(desired_logic_layout[i])] = i; + + // Construct resulting desc by modifying original one + dnnl::memory::desc res_desc = t_desc_; + + memset(&res_desc.data.format_desc.blocking, 0, sizeof(res_desc.data.format_desc.blocking)); + std::fill(res_desc.data.dims, res_desc.data.dims + DNNL_MAX_NDIMS, 0); + std::fill(res_desc.data.padded_dims, res_desc.data.padded_dims + DNNL_MAX_NDIMS, 0); + + res_desc.data.ndims = rank; + res_desc.data.format_desc.blocking.inner_nblks = innermost_tokens.size(); + + auto res_dims = res_desc.data.dims; + auto res_strides = res_desc.data.format_desc.blocking.strides; + auto res_inner_blks = res_desc.data.format_desc.blocking.inner_blks; + auto res_inner_idxs = res_desc.data.format_desc.blocking.inner_idxs; + + std::fill(res_dims, res_dims + rank, 1); + + int orig_dim_idx = 0; + for (const auto& p : outermost_tokens) { + auto tag = p.second; + auto dim_size = origin_dims[orig_dim_idx]; + + auto result_dim_position = dim_position_by_tag[tag]; + res_dims[result_dim_position] *= dim_size; + res_strides[result_dim_position] = t_desc_.data.format_desc.blocking.strides[orig_dim_idx]; + orig_dim_idx++; + } + for (const auto& p : innermost_tokens) { + auto tag = p.second; + auto dim_size = origin_dims[orig_dim_idx]; + auto result_dim_position = dim_position_by_tag[tag]; + ICHECK_EQ(p.first, dim_size) + << "Blocking layout is not applicable to tensor with shape: " << origin_dims + << ". Requested layout is " << layout; + + res_dims[result_dim_position] *= dim_size; + *res_inner_blks++ = dim_size; + *res_inner_idxs++ = result_dim_position; + orig_dim_idx++; + } + + // Assume tensor is dense. There is no additional padding. + std::copy(res_desc.data.dims, res_desc.data.dims + rank, res_desc.data.padded_dims); + + if (t_desc_ == res_desc) return *this; + + auto orig = std::make_shared(*this); + return {res_desc, orig, true, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! + * \brief Produce TR with unspecified layout. + * + * Cannot be registered in TensorRegistry. Only for querying DNNL for preferred layouts. + */ + TensorRequisite LayoutAny() const { + auto orig = std::make_shared(*this); + // Recreate tensor desc with layout 'any' + dnnl::memory::desc any_desc{t_desc_.dims(), t_desc_.data_type(), dnnl::memory::format_tag::any}; + return {any_desc, orig, false, {}, kUndefinedTid, reverse_data_flow_}; + } + + /*! \brief Check is TR is constant. */ + bool IsConstant() const { + if (orig_) return orig_->IsConstant(); + return mem_.operator bool(); + } + + /*! \brief Check is tensor is scalar. 
*/ + bool IsScalar() const { return t_desc_.dims().size() == 1 && t_desc_.dims()[0] == 1; } + + /*! \brief Return const data memory if available. */ + dnnl::memory GetConstData() const { + if (mem_) return mem_; + if (!orig_) return {}; + + if (auto orig_const_data = orig_->GetConstData()) { + if (reinterpret_) { + return {t_desc_, orig_const_data.get_engine(), orig_const_data.get_data_handle()}; + } else { + auto eng = orig_const_data.get_engine(); + auto res = dnnl::memory{t_desc_, eng}; + dnnl::reorder(orig_const_data, res).execute(dnnl::stream(eng), orig_const_data, res); + return res; + } + } + return {}; + } + + /*! + * \brief Return const data memory in form of vector. + * + * Same as GetConstData but use std::vector instead of dnnl::memory. Works only for 1D tensor + * and scalar TRs. Useful for specification of 1D DNNL attributes like zero_point or + * per_channel_scale + */ + template + std::vector GetConstDataLikeVec() const { + auto const_data = GetConstData(); + auto desc = const_data.get_desc(); + ICHECK(desc.data_type() == utils::DnnlDType()); + ICHECK(desc.dims().size() == 1); + + auto size = desc.get_size() / sizeof(T); + auto ptr = static_cast(const_data.get_data_handle()); + + return std::vector(ptr, ptr + size); + } + + /*! \brief Get value of constant scalar tensor if possible. */ + template + T GetConstScalarData() const { + ICHECK(IsConstant()); + ICHECK(IsScalar()); + auto const_data = GetConstData(); + auto desc = const_data.get_desc(); + ICHECK(desc.data_type() == utils::DnnlDType()); + + auto ptr = static_cast(const_data.get_data_handle()); + return *ptr; + } + + /*! \brief Check if tensor is not empty. */ + bool defined() const { return !t_desc_.is_zero(); } + + /*! \brief Same as defined */ + operator bool() const { return defined(); } + + /*! + * \brief Check if tensor represent a reversed data flow. + * Useful for describing output processing + */ + bool IsReversed() const { return reverse_data_flow_; } + + private: + TensorRequisite(const dnnl::memory::desc& t_desc, const std::shared_ptr& orig, + bool reinterpret, const dnnl::memory& const_mem, uint32_t eid, + bool reverse_data_flow) + : t_desc_(t_desc), + orig_(orig), + reinterpret_(reinterpret), + mem_(const_mem), + eid_(eid), + reverse_data_flow_(reverse_data_flow) { + if (mem_) ICHECK(!orig_ && !reverse_data_flow_ && eid_ == kUndefinedTid); + if (eid_ != kUndefinedTid) ICHECK(!orig_); + } + + /* Descriptor of particular tensor */ + dnnl::memory::desc t_desc_ = {}; + /* Parent TR object which is referred from this TR */ + std::shared_ptr orig_ = {}; + /* Flag to specify which action should be done with orig TR, reordering or reinterpretation */ + bool reinterpret_ = false; + /* Const memory object if available */ + dnnl::memory mem_ = {}; + /* Entry ID of tensor if available */ + uint32_t eid_ = kUndefinedTid; + + /* + * Flag to describe reverse data flow case + * All operation on queue will be executed in reverse order. Actual for dst tensor description + */ + bool reverse_data_flow_ = false; + + friend class TensorRegistry; +}; + +/*! + * \brief The registry of tensors. Implement matching of provided TRs and real memory buffers. + * + * Registration of TR performed by calling method Register(), which will return ArgId object. + * ArgId can be mapped to real memory via memory solver created by method MakeSolver(). + */ +class TensorRegistry { + private: + enum ArgReqFlag { + CONST, /// < Constant tensor. ExecutionCTX independent + TMP_STORAGE, /// < Intermediate tensors. Stored inside TensorRegistry. 
Inaccessible outside + EXT_EID, /// < External data. Input or Output. + }; + + public: + struct ArgId { + TensorRegistry::ArgReqFlag flag_; + uint32_t idx_; + }; + + using Action = std::tuple>; + using ActionQue = std::vector; + using DLTensorProvider = std::function; + using MemSolver = std::function; + + TensorRegistry() = default; + TensorRegistry(const dnnl::engine& eng, const std::set& ext_io_eid) + : tmp_mem_collection_(1), ext_io_eid_(ext_io_eid), eng_(eng), stream_(eng) {} + + /*! + * \brief Register TR to registry + * + * Resolution of TR may lead to introduction of intermediate memory buffers and additional + * transformation actions which should be performed before or after usage of corresponding memory + * buffer. Additional actions will be append to provided actions queue. Corresponding to + * tr.IsReversed() value actions should be executed before or after usage of resulting ArgId. + * + * \param tr tensor requisite sequence to register + * \param action resulting action queue. If TR resolution is required execution of some + * transformation actions they will be put here + * \return associated ArgId. Should be used as argument for MemSolver. + */ + ArgId Register(const TensorRequisite& tr, ActionQue* action) { + // 1) Constant tensor. Direct reference + if (auto const_data = tr.GetConstData()) { + auto idx = const_mem_collection_.size(); + const_mem_collection_.push_back(const_data); + return MakeArgReq(ArgReqFlag::CONST, static_cast(idx)); + } + + // 2) EID mapped tensor. Direct reference + if (tr.eid_ != TensorRequisite::kUndefinedTid) { + if (ext_io_eid_.count(tr.eid_) == 0) { // Not IO tensor, means it's intermediate + if (eid2idx_tmp_.count(tr.eid_)) { + auto idx = eid2idx_tmp_.at(tr.eid_); + return MakeArgReq(ArgReqFlag::TMP_STORAGE, idx); + } else { + // register himself + auto idx = tmp_mem_collection_.size(); + tmp_mem_collection_.push_back(tr.t_desc_); + eid2idx_tmp_[tr.eid_] = idx; + return MakeArgReq(ArgReqFlag::TMP_STORAGE, static_cast(idx)); + } + } else { + auto idx = ext_mem_collection_.size(); + ext_mem_collection_.push_back({tr.eid_, tr.t_desc_}); + return MakeArgReq(ArgReqFlag::EXT_EID, static_cast(idx)); + } + } + + // 3) Tensors with transform actions + if (tr.orig_) { + // recursive register of orig TR + auto orig_arg_req = Register(*tr.orig_, action); + if (tr.reinterpret_) { + return RegisterReinterpret(orig_arg_req, tr.t_desc_); + } else { + return RegisterReorder(orig_arg_req, tr.t_desc_, tr.reverse_data_flow_, action); + } + } + + // 4) Scratchpad + ICHECK(!tr.orig_ && !tr.mem_ && tr.eid_ == TensorRequisite::kUndefinedTid); + auto idx = tmp_mem_collection_.size(); + tmp_mem_collection_.push_back(tr.t_desc_); + tmp_mem_mapping_[idx] = 0; // zero position tmp mem object is reserved for scratchpads + + auto scratchpad_size = tr.t_desc_.get_size(); + auto glob_scratchpad_size = tmp_mem_collection_[0].get_size(); + if (scratchpad_size > glob_scratchpad_size) { + tmp_mem_collection_[0] = + dnnl::memory::desc({static_cast(scratchpad_size)}, + dnnl::memory::data_type::u8, dnnl::memory::format_tag::a); + } + return MakeArgReq(TMP_STORAGE, static_cast(idx)); + } + + /*! + * \brief Construct memory solver for all registered TRs. 
+ * \param ext_provider callback to resolve external IO buffers + * \return memory solver object to match ArgId to dnnl::memory objects + */ + MemSolver MakeSolver(const DLTensorProvider& ext_provider) const { + return MemSolverImpl(eng_, ext_provider, const_mem_collection_, ext_mem_collection_, + tmp_mem_collection_, tmp_mem_mapping_); + } + + private: + ArgId RegisterReinterpret(ArgId src_ar, const dnnl::memory::desc& desc) { + switch (src_ar.flag_) { + case TMP_STORAGE: { + auto idx = tmp_mem_collection_.size(); + tmp_mem_collection_.push_back(desc); + tmp_mem_mapping_[idx] = src_ar.idx_; + return MakeArgReq(TMP_STORAGE, idx); + } + case EXT_EID: { + auto ext_req = ext_mem_collection_[src_ar.idx_]; + auto idx = ext_mem_collection_.size(); + ext_mem_collection_.push_back({ext_req.first, desc}); + return MakeArgReq(EXT_EID, idx); + } + default: + LOG(FATAL) << "Unknown case"; + } + return {}; + } + + ArgId RegisterReorder(ArgId src_ar, const dnnl::memory::desc& desc, bool reverse_data_flow, + ActionQue* action) { + ICHECK(src_ar.flag_ == TMP_STORAGE || src_ar.flag_ == EXT_EID); + + auto src_desc = src_ar.flag_ == TMP_STORAGE ? tmp_mem_collection_[src_ar.idx_] + : ext_mem_collection_[src_ar.idx_].second; + auto idx = tmp_mem_collection_.size(); + tmp_mem_collection_.push_back(desc); + auto dst_ar = MakeArgReq(TMP_STORAGE, idx); + + // reorder action submit + if (reverse_data_flow) { + auto reorder_pd = dnnl::reorder::primitive_desc(eng_, desc, eng_, src_desc); + action->insert(action->begin(), + {dnnl::reorder(reorder_pd), {{DNNL_ARG_FROM, dst_ar}, {DNNL_ARG_TO, src_ar}}}); + } else { + auto reorder_pd = dnnl::reorder::primitive_desc(eng_, src_desc, eng_, desc); + action->push_back( + {dnnl::reorder(reorder_pd), {{DNNL_ARG_FROM, src_ar}, {DNNL_ARG_TO, dst_ar}}}); + } + return dst_ar; + } + /*! \brief Implementation of memory solver */ + class MemSolverImpl { + public: + MemSolverImpl(const dnnl::engine& eng, const DLTensorProvider& ext_data_provider, + const std::vector& const_mems, + const std::vector>& ext_mems, + const std::vector& tmp_mem_descs, + const std::map& tmp_mem_mapping) + : eng_(eng), + ext_data_provider_(ext_data_provider), + const_mems_(const_mems), + ext_mems_(ext_mems) { + // Construct temp memory objects on the fly. While we have no scratchpads + // support on VM/GraphExecutor level. + tmp_mems_.resize(tmp_mem_descs.size()); + for (size_t i = 0; i < tmp_mem_descs.size(); i++) { + auto found = tmp_mem_mapping.find(i); + + if (found != tmp_mem_mapping.end()) { + auto reuse_hdl = tmp_mems_[found->second].get_data_handle(); + tmp_mems_[i] = dnnl::memory(tmp_mem_descs[i], eng_, reuse_hdl); + } else { + tmp_mems_[i] = dnnl::memory(tmp_mem_descs[i], eng_); + } + } + } + + /*! 
\brief Find memory object associated with provided ArgId */ + dnnl::memory operator()(const ArgId& ar) const { + switch (ar.flag_) { + case CONST: + return const_mems_.at(ar.idx_); + case TMP_STORAGE: + return tmp_mems_.at(ar.idx_); + case EXT_EID: { + auto eid_and_desc = ext_mems_.at(ar.idx_); + auto eid = eid_and_desc.first; + auto desc = eid_and_desc.second; + + auto ext_dl_tensor = ext_data_provider_(eid); + ICHECK(ext_dl_tensor->data); + return dnnl::memory{desc, eng_, ext_dl_tensor->data}; + } + } + return {}; + } + + private: + const dnnl::engine& eng_; + const DLTensorProvider& ext_data_provider_; + const std::vector& const_mems_; + const std::vector>& ext_mems_; + std::vector tmp_mems_; + }; + + ArgId MakeArgReq(ArgReqFlag flag, uint32_t idx) { return {flag, idx}; } + + /* Collection of const memory objects. */ + std::vector const_mem_collection_; + + /* Collection of intermediate memory descriptors. Zero position is reserved for scratchpads. */ + std::vector tmp_mem_collection_; + + /* Mapping of some temp buffer on previously registered. */ + std::map tmp_mem_mapping_; + + /* Collection of external_intermediate memory objects. + * first - eid of external buffer to ask + * second - t_desc describes how to treat external buffer */ + std::vector> ext_mem_collection_; + + /* Map of eid to index of temp buffer in tmp_mem_collection_ */ + std::unordered_map eid2idx_tmp_; + + /* List of external eid */ + std::set ext_io_eid_; + + /* Engine of all tensors existing in this registry */ + dnnl::engine eng_; + + /* Execution stream use to reorder const data */ + dnnl::stream stream_; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_DNNL_DNNL_TENSOR_REQUISITE_H_ diff --git a/src/runtime/contrib/dnnl/dnnl_utils.cc b/src/runtime/contrib/dnnl/dnnl_utils.cc index 7e79f1c939cfe..23992209f2ad5 100644 --- a/src/runtime/contrib/dnnl/dnnl_utils.cc +++ b/src/runtime/contrib/dnnl/dnnl_utils.cc @@ -23,11 +23,14 @@ #include "dnnl_utils.h" +#include "tvm/runtime/logging.h" + namespace tvm { namespace runtime { namespace contrib { -using dt = dnnl::memory::data_type; -dt dtype_dl2dnnl(DLDataType dltype) { + +dnnl::memory::data_type dtype_dl2dnnl(DLDataType dltype) { + using dt = dnnl::memory::data_type; dt dnnl_type = dt::undef; if (dltype.code == DataType::TypeCode::kFloat) { if (dltype.bits == 16) { @@ -51,6 +54,23 @@ dt dtype_dl2dnnl(DLDataType dltype) { } return dnnl_type; } + +dnnl::memory::dims shape_dl2dnnl(const std::vector& shape) { + if (shape.empty()) return {1}; // DNNL scalar representation is 1D tensor + return shape; +} + +dnnl::memory::desc MakePlainDesc(const std::vector& shape, DLDataType dltype) { + auto dnnl_shape = shape_dl2dnnl(shape); + auto dnnl_dtype = dtype_dl2dnnl(dltype); + + auto dnnl_plain_strides = dnnl::memory::dims(dnnl_shape.size(), 1); + for (int i = dnnl_shape.size() - 2; i >= 0; i--) + dnnl_plain_strides[i] = dnnl_plain_strides[i + 1] * dnnl_shape[i + 1]; + + return {dnnl_shape, dnnl_dtype, dnnl_plain_strides}; +} + } // namespace contrib } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/dnnl/dnnl_utils.h b/src/runtime/contrib/dnnl/dnnl_utils.h index 4fb236f96f8b1..a598b6704450f 100644 --- a/src/runtime/contrib/dnnl/dnnl_utils.h +++ b/src/runtime/contrib/dnnl/dnnl_utils.h @@ -18,16 +18,23 @@ */ /*! - * \file src/runtime/contrib/dnnl/dnnl_utils.h - * \brief utils for DNNL. 
+ * \file src/runtime/contrib/dnnl/dnnl_utils.cc + * \brief Some DNNL specific utility functions */ #ifndef TVM_RUNTIME_CONTRIB_DNNL_DNNL_UTILS_H_ #define TVM_RUNTIME_CONTRIB_DNNL_DNNL_UTILS_H_ -#include +#include +#include +#include +#include -#include "dnnl.hpp" +// TODO(@apeskov): Have to mute warning from dnnl headers. +// -Wzero-as-null-pointer-constant and -Wdocumentation-unknown-command +#include + +#include "tvm/runtime/data_type.h" namespace tvm { namespace runtime { @@ -40,7 +47,90 @@ namespace contrib { */ dnnl::memory::data_type dtype_dl2dnnl(DLDataType dltype); +/*! + * \brief Converter TVM shape to DNNL dims + * \param shape tvm shape + * \return dims in terms of dnnl + */ +dnnl::memory::dims shape_dl2dnnl(const std::vector& shape); + +/*! + * \brief Construct plain tensor descriptor + * \param shape provided shape + * \param dltype provided data type + * \return resulting plain tensor desc + */ +dnnl::memory::desc MakePlainDesc(const std::vector& shape, DLDataType dltype); + +namespace utils { + +/*! \brief Pretty printer util for shape */ +inline std::ostream& operator<<(std::ostream& o, const dnnl::memory::dims& dims) { + o << "["; + auto d = dims.begin(); + if (d != dims.end()) o << *d++; + while (d != dims.end()) o << "," << *d++; + o << "]"; + return o; +} + +/*! \brief Pretty printer util for data type */ +inline std::ostream& operator<<(std::ostream& o, const dnnl::memory::data_type& type) { + std::string name = "undef"; + switch (type) { + case dnnl::memory::data_type::undef: + name = "undef"; + break; + case dnnl::memory::data_type::f32: + name = "fp32"; + break; + case dnnl::memory::data_type::f16: + name = "fp16"; + break; + case dnnl::memory::data_type::bf16: + name = "bf16"; + break; + case dnnl::memory::data_type::s32: + name = "i32"; + break; + case dnnl::memory::data_type::s8: + name = "i8"; + break; + case dnnl::memory::data_type::u8: + name = "u8"; + break; + } + o << name; + return o; +} + +/*! 
\brief Converter data type template arg to runtime object */ +template +inline dnnl::memory::data_type DnnlDType(); + +template <> +inline dnnl::memory::data_type DnnlDType() { + return dnnl::memory::data_type::s32; +} + +template <> +inline dnnl::memory::data_type DnnlDType() { + return dnnl::memory::data_type::f32; +} + +template <> +inline dnnl::memory::data_type DnnlDType() { + return dnnl::memory::data_type::u8; +} + +template <> +inline dnnl::memory::data_type DnnlDType() { + return dnnl::memory::data_type::s8; +} + +} // namespace utils } // namespace contrib } // namespace runtime } // namespace tvm + #endif // TVM_RUNTIME_CONTRIB_DNNL_DNNL_UTILS_H_ From 4f5ab57d348e97b707d0707f9272cebe03a79777 Mon Sep 17 00:00:00 2001 From: ChunPing Chung Date: Fri, 3 Jun 2022 00:28:38 +0800 Subject: [PATCH 020/181] [Frontend][ONNX] Fix softmax converter when input shape is dynamic (#11507) * [Frontend][ONNX] Fix softmax converter when input shape is dynamic * [Frontend][ONNX] mark dynamic softmax tests as xfailed with cuda --- python/tvm/relay/frontend/onnx.py | 2 ++ tests/python/frontend/onnx/test_forward.py | 37 ++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 30e8188a8312c..997aa6240e9e8 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2420,6 +2420,8 @@ def _impl_v1(cls, inputs, attr, params): axis += ndim if axis == 0: reshape_shape = [-1] + elif axis == ndim - 1: + return _op.nn.softmax(inputs[0], axis=axis) else: axis_val = [in_shape[i] for i in range(axis)] reshape_shape = [np.prod(axis_val)] + [-1] diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index dbc5147e20300..c4cd93aa7d9b0 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1589,26 +1589,45 @@ def test_upsample3d_trilinear(target, dev): tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) +# TODO: Fix softmax with dynamic input on cuda and enable this test +@tvm.testing.known_failing_targets("cuda") @tvm.testing.parametrize_targets def test_softmax(target, dev): - def verify_softmax(inshape, axis): + def verify_softmax(inshape, axis, opset=None, dynamic=False): opname = "Softmax" - indata = np.random.uniform(size=inshape).astype(np.float32) outshape = inshape - y = helper.make_node(opname, ["in"], ["out"]) + node_list = [] + input_node_list = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(inshape))] + output_node_list = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outshape))] + input_list = [np.random.uniform(size=inshape).astype(np.float32)] + softmax_inputs = ["in"] + + if dynamic: + input_node_list.append( + helper.make_tensor_value_info("shape", TensorProto.INT64, [len(inshape)]) + ) + input_list.append(np.asarray(inshape)) + reshape_node = helper.make_node("Reshape", ["in", "shape"], ["dynamic_in"]) + softmax_inputs[0] = "dynamic_in" + node_list += [reshape_node] + + y = helper.make_node(opname, softmax_inputs, ["out"]) if axis is not None: axis_attr = helper.make_attribute("axis", axis) y.attribute.append(axis_attr) + node_list.append(y) graph = helper.make_graph( - [y], + node_list, opname + "_test", - inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outshape))], + inputs=input_node_list, + 
outputs=output_node_list, ) model = helper.make_model(graph, producer_name=opname + "_test") - verify_with_ort_with_inputs(model, [indata], target=target, dev=dev) + verify_with_ort_with_inputs( + model, input_list, use_vm=True, opset=opset, target=target, dev=dev + ) verify_softmax((1, 10), None) verify_softmax((1, 10), 1) @@ -1616,6 +1635,10 @@ def verify_softmax(inshape, axis): verify_softmax((1, 2, 3, 10), 2) verify_softmax((1, 2, 3, 4, 10), 3) verify_softmax((1, 2, 3, 4, 10), 4) + verify_softmax((1, 10), -1, dynamic=True) + verify_softmax((1, 2, 3, 10), -1, dynamic=True) + verify_softmax((1, 10), -1, opset=8, dynamic=True) + verify_softmax((1, 2, 3, 10), -1, opset=8, dynamic=True) @tvm.testing.parametrize_targets From 480fa744eb66a2c6013d43ee46778d02b905ca19 Mon Sep 17 00:00:00 2001 From: Jocelyn S Date: Thu, 2 Jun 2022 13:15:04 -0400 Subject: [PATCH 021/181] [Onnx] Round operator (#11446) * banker round op added based off tutorial * black'd onnx.py file * retriggering CI with empty commit due to autoscheduler test failure * removed youtube link in comments * retriggering CI due to test failure that passed locally --- python/tvm/relay/frontend/onnx.py | 21 +++++++++++++++++++-- tests/python/frontend/onnx/test_forward.py | 1 - 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 997aa6240e9e8..abfa5629d5534 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -5061,10 +5061,27 @@ def _impl_v1(cls, inputs, attr, params): return _expr.TupleWrapper(_expr.Tuple(result), len(result)) +class Round(OnnxOpConverter): + """Operator converter for round op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + # Onnx round uses Banker's rounding which rounds .5 to the nearest even integer + + x = inputs[0] + half = _expr.const(0.5, dtype="float32") + one = _expr.const(1, dtype="float32") + two = _expr.const(2, dtype="float32") + + rounded = _op.ceil(x - half) + bankers_mask = one - (_op.ceil(x + half) - _op.floor(x + half)) + non_even = _op.abs(_op.mod(rounded, two)) + return rounded + (bankers_mask * non_even) + + # compatible operators that do NOT require any conversion. 
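# To see why the Round converter above reproduces ONNX round-half-to-even, the
# same arithmetic can be checked with plain NumPy. This is only an illustrative
# sketch (the helper name `onnx_style_round` is made up and is not part of the
# converter); np.round itself rounds halves to the nearest even integer, so it
# serves as the reference.
import numpy as np

def onnx_style_round(x):
    rounded = np.ceil(x - 0.5)
    # 1.0 exactly at halfway values (x + 0.5 is an integer), 0.0 otherwise
    bankers_mask = 1.0 - (np.ceil(x + 0.5) - np.floor(x + 0.5))
    # 1.0 when the tentative result is odd; only matters at halfway points,
    # where the mask gates the correction
    non_even = np.abs(np.mod(rounded, 2.0))
    return rounded + bankers_mask * non_even

x = np.array([-2.5, -1.5, -0.5, 0.5, 1.5, 2.3, 2.5, 2.7], dtype="float32")
assert np.array_equal(onnx_style_round(x), np.round(x))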
_identity_list = [] - # _convert_map defines maps of name to converter functor(callable) # for 1 to 1 mapping, use Renamer if nothing but name is different # use AttrCvt if attributes need to be converted @@ -5109,7 +5126,7 @@ def _get_convert_map(opset): "Reciprocal": Reciprocal.get_converter(opset), "Floor": Renamer("floor"), "Ceil": Renamer("ceil"), - "Round": Renamer("round"), + "Round": Round.get_converter(opset), "IsInf": IsInf.get_converter(opset), "IsNaN": Renamer("isnan"), "Sqrt": Renamer("sqrt"), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index c4cd93aa7d9b0..ebaad9b4cb136 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5183,7 +5183,6 @@ def verify_eyelike(indata): "test_reduce_sum_negative_axes_keepdims_example", "test_reduce_sum_negative_axes_keepdims_random", "test_rnn_seq_length", - "test_round", "test_sequence_insert_at_back", "test_sequence_insert_at_front", "test_simple_rnn_batchwise", From 84eb78cbc4663d6f25ee5a7ead6a930eba02776b Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 2 Jun 2022 10:47:29 -0700 Subject: [PATCH 022/181] [MetaSchedule] No explicit for spatial PrimFunc (#11534) --- .../parallel_vectorize_unroll.cc | 7 +- src/tir/schedule/analysis.h | 7 + src/tir/schedule/analysis/analysis.cc | 19 ++ ...schedule_rule_parallel_vectorize_unroll.py | 179 ++++++++++++++++++ 4 files changed, 211 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc index c0e57a6d037a5..19758996e6080 100644 --- a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc +++ b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc @@ -26,6 +26,11 @@ bool IsRootBlock(const Schedule& sch, const BlockRV& block_rv) { return block_sref->parent == nullptr; } +bool CheckSpatialPrimFunc(const Schedule& sch, const BlockRV& root_block_rv) { + return IsSpatialPrimFunc( + GetRef(GetRootPrimFunc(sch->mod(), sch->Get(root_block_rv).get(), nullptr))); +} + } // namespace tir } // namespace tvm @@ -60,7 +65,7 @@ class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode { sch->Annotate(root_rv, tir::attr::meta_schedule_vectorize, Integer(max_vectorize_extent)); } // Unroll - if (!unroll_max_steps.empty()) { + if (!unroll_max_steps.empty() && !tir::CheckSpatialPrimFunc(sch, root_rv)) { int n = unroll_max_steps.size(); double prob = 1.0 / n; Array probs(n, FloatImm(DataType::Float(64), prob)); diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index 0574cfefadb6f..5adc4f8f1b30a 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -625,6 +625,13 @@ bool IsTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref); */ bool NeedsMultiLevelTiling(const ScheduleState& self, const StmtSRef& block_sref); +/*! + * \brief Checks if all the blocks in the PrimFunc is spatial + * \param func The PrimFunc to be checked + * \return A boolean indicating whether all the blocks in the PrimFunc is spatial + */ +bool IsSpatialPrimFunc(const PrimFunc& func); + /*! * \brief Checks if the rfactor or cross thread reduction is beneficial to the given block. * \param self The schedule state. 
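Here a PrimFunc counts as spatial when every block iteration variable in its body is data-parallel, i.e. there is no reduction iter var anywhere; presumably automatic unrolling brings little benefit to such purely elementwise/layout workloads, so the rule above skips the unroll sampling for them while keeping parallelization and vectorization. A minimal TVMScript sketch of the two cases (illustrative only; the function names are made up and not part of the patch):

from tvm.script import tir as T

@T.prim_func
def spatial_func(A: T.Buffer[(64, 64), "float32"], B: T.Buffer[(64, 64), "float32"]) -> None:
    # Only "S" (data-parallel) iter vars: IsSpatialPrimFunc would return true,
    # so ParallelizeVectorizeUnroll no longer samples an unroll depth here.
    for i, j in T.grid(64, 64):
        with T.block("compute"):
            vi, vj = T.axis.remap("SS", [i, j])
            B[vi, vj] = A[vi, vj] + T.float32(1)

@T.prim_func
def non_spatial_func(A: T.Buffer[(64, 64), "float32"], B: T.Buffer[(64,), "float32"]) -> None:
    # The "R" (reduction) iter var makes this PrimFunc non-spatial, so the
    # unroll sampling still applies as before.
    for i, k in T.grid(64, 64):
        with T.block("row_sum"):
            vi, vk = T.axis.remap("SR", [i, k])
            with T.init():
                B[vi] = T.float32(0)
            B[vi] = B[vi] + A[vi, vk]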
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index 83ef6adae3b23..0f84dfef1135f 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -1957,6 +1957,25 @@ bool NeedsMultiLevelTiling(const ScheduleState& self, const StmtSRef& block_sref return total_unused_block_vars >= 1; } +bool IsSpatialPrimFunc(const PrimFunc& func) { + bool result = true; + PreOrderVisit(func->body, [&result](const ObjectRef& obj) { + if (result == false) { + return false; + } + if (const auto* block = obj.as()) { + for (const IterVar& iter_var : block->iter_vars) { + if (iter_var->iter_type != IterVarType::kDataPar) { + result = false; + return false; + } + } + } + return true; + }); + return result; +} + std::pair GetCumulativeSpaceAndReductionLength(const tir::ScheduleState& self, const tir::StmtSRef& block_sref) { Array loops = tir::GetLoops(block_sref); diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py index e57799f604b8a..85aa80eb3c82b 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm +from tvm import meta_schedule as ms from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply from tvm.meta_schedule.testing.schedule_rule import parallel_vectorize_unroll from tvm.meta_schedule.testing.space_generation import check_trace @@ -61,6 +62,164 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + +# from tvm.script import tir as T +@tvm.script.ir_module +class PureSpatial: + @T.prim_func + def main(placeholder: T.Buffer[(1, 13, 13, 3, 85), "float32"], placeholder_1: T.Buffer[(1, 26, 26, 3, 85), "float32"], placeholder_2: T.Buffer[(1, 52, 52, 3, 85), "float32"], T_expand_dims: T.Buffer[(1, 80, 10647), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + T_strided_slice_with_axes = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32") + T_sigmoid = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32") + T_strided_slice_with_axes_1 = T.alloc_buffer([1, 52, 52, 3, 80], dtype="float32") + T_sigmoid_1 = T.alloc_buffer([1, 52, 52, 3, 80], dtype="float32") + T_multiply = T.alloc_buffer([1, 52, 52, 3, 80], dtype="float32") + T_reshape = T.alloc_buffer([8112, 80], dtype="float32") + T_strided_slice_with_axes_2 = T.alloc_buffer([1, 26, 26, 3, 1], dtype="float32") + T_sigmoid_2 = T.alloc_buffer([1, 26, 26, 3, 1], dtype="float32") + T_strided_slice_with_axes_3 = T.alloc_buffer([1, 26, 26, 3, 80], dtype="float32") + T_sigmoid_3 = T.alloc_buffer([1, 26, 26, 3, 80], dtype="float32") + T_multiply_1 = T.alloc_buffer([1, 26, 26, 3, 80], dtype="float32") + T_reshape_1 = T.alloc_buffer([2028, 80], dtype="float32") + T_strided_slice_with_axes_4 = T.alloc_buffer([1, 13, 13, 3, 1], dtype="float32") + T_sigmoid_4 = T.alloc_buffer([1, 13, 13, 3, 1], dtype="float32") + T_strided_slice_with_axes_5 = T.alloc_buffer([1, 13, 13, 3, 80], dtype="float32") + T_sigmoid_5 = T.alloc_buffer([1, 13, 13, 3, 80], dtype="float32") + T_multiply_2 = T.alloc_buffer([1, 13, 13, 3, 80], dtype="float32") 
+ T_reshape_2 = T.alloc_buffer([507, 80], dtype="float32") + T_concat = T.alloc_buffer([10647, 80], dtype="float32") + T_transpose = T.alloc_buffer([80, 10647], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 52, 52, 3, 1): + with T.block("T_strided_slice_with_axes"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder_2[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)]) + T.writes(T_strided_slice_with_axes[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes[ax0, ax1, ax2, ax3, ax4] = placeholder_2[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)] + for i0, i1, i2, i3, i4 in T.grid(1, 52, 52, 3, 1): + with T.block("T_sigmoid"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 52, 52, 3, 80): + with T.block("T_strided_slice_with_axes_1"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder_2[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)]) + T.writes(T_strided_slice_with_axes_1[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes_1[ax0, ax1, ax2, ax3, ax4] = placeholder_2[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)] + for i0, i1, i2, i3, i4 in T.grid(1, 52, 52, 3, 80): + with T.block("T_sigmoid_1"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes_1[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid_1[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid_1[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes_1[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 52, 52, 3, 80): + with T.block("T_multiply"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_sigmoid[ax0, ax1, ax2, ax3, 0], T_sigmoid_1[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_multiply[ax0, ax1, ax2, ax3, ax4]) + T_multiply[ax0, ax1, ax2, ax3, ax4] = T_sigmoid[ax0, ax1, ax2, ax3, 0] * T_sigmoid_1[ax0, ax1, ax2, ax3, ax4] + for i0, i1 in T.grid(8112, 80): + with T.block("T_reshape"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_multiply[0, (ax1 // 80 + ax0) % 8112 // 156, (ax1 // 80 + ax0) % 156 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80]) + T.writes(T_reshape[ax0, ax1]) + T_reshape[ax0, ax1] = T_multiply[0, (ax1 // 80 + ax0) % 8112 // 156, (ax1 // 80 + ax0) % 156 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80] + for i0, i1, i2, i3, i4 in T.grid(1, 26, 26, 3, 1): + with T.block("T_strided_slice_with_axes_2"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder_1[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)]) + T.writes(T_strided_slice_with_axes_2[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes_2[ax0, ax1, ax2, ax3, ax4] = placeholder_1[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)] + for i0, i1, i2, i3, i4 in T.grid(1, 26, 26, 3, 1): + with T.block("T_sigmoid_2"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes_2[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid_2[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid_2[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes_2[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 26, 26, 3, 80): + with 
T.block("T_strided_slice_with_axes_3"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder_1[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)]) + T.writes(T_strided_slice_with_axes_3[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes_3[ax0, ax1, ax2, ax3, ax4] = placeholder_1[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)] + for i0, i1, i2, i3, i4 in T.grid(1, 26, 26, 3, 80): + with T.block("T_sigmoid_3"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes_3[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid_3[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid_3[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes_3[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 26, 26, 3, 80): + with T.block("T_multiply_1"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_sigmoid_2[ax0, ax1, ax2, ax3, 0], T_sigmoid_3[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_multiply_1[ax0, ax1, ax2, ax3, ax4]) + T_multiply_1[ax0, ax1, ax2, ax3, ax4] = T_sigmoid_2[ax0, ax1, ax2, ax3, 0] * T_sigmoid_3[ax0, ax1, ax2, ax3, ax4] + for i0, i1 in T.grid(2028, 80): + with T.block("T_reshape_1"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_multiply_1[0, (ax1 // 80 + ax0) % 2028 // 78, (ax1 // 80 + ax0) % 78 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80]) + T.writes(T_reshape_1[ax0, ax1]) + T_reshape_1[ax0, ax1] = T_multiply_1[0, (ax1 // 80 + ax0) % 2028 // 78, (ax1 // 80 + ax0) % 78 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80] + for i0, i1, i2, i3, i4 in T.grid(1, 13, 13, 3, 1): + with T.block("T_strided_slice_with_axes_4"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)]) + T.writes(T_strided_slice_with_axes_4[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes_4[ax0, ax1, ax2, ax3, ax4] = placeholder[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(4)] + for i0, i1, i2, i3, i4 in T.grid(1, 13, 13, 3, 1): + with T.block("T_sigmoid_4"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes_4[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid_4[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid_4[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes_4[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 13, 13, 3, 80): + with T.block("T_strided_slice_with_axes_5"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(placeholder[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)]) + T.writes(T_strided_slice_with_axes_5[ax0, ax1, ax2, ax3, ax4]) + T_strided_slice_with_axes_5[ax0, ax1, ax2, ax3, ax4] = placeholder[ax0, ax1, ax2, ax3, T.cast(ax4, "int64") + T.int64(5)] + for i0, i1, i2, i3, i4 in T.grid(1, 13, 13, 3, 80): + with T.block("T_sigmoid_5"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_strided_slice_with_axes_5[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_sigmoid_5[ax0, ax1, ax2, ax3, ax4]) + T_sigmoid_5[ax0, ax1, ax2, ax3, ax4] = T.sigmoid(T_strided_slice_with_axes_5[ax0, ax1, ax2, ax3, ax4], dtype="float32") + for i0, i1, i2, i3, i4 in T.grid(1, 13, 13, 3, 80): + with T.block("T_multiply_2"): + ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) + T.reads(T_sigmoid_4[ax0, ax1, ax2, ax3, 0], T_sigmoid_5[ax0, ax1, ax2, ax3, ax4]) + T.writes(T_multiply_2[ax0, ax1, ax2, ax3, ax4]) + T_multiply_2[ax0, 
ax1, ax2, ax3, ax4] = T_sigmoid_4[ax0, ax1, ax2, ax3, 0] * T_sigmoid_5[ax0, ax1, ax2, ax3, ax4] + for i0, i1 in T.grid(507, 80): + with T.block("T_reshape_2"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_multiply_2[0, (ax1 // 80 + ax0) % 507 // 39, (ax1 // 80 + ax0) % 39 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80]) + T.writes(T_reshape_2[ax0, ax1]) + T_reshape_2[ax0, ax1] = T_multiply_2[0, (ax1 // 80 + ax0) % 507 // 39, (ax1 // 80 + ax0) % 39 // 3, (ax1 // 80 + ax0) % 3, ax1 % 80] + for i0, i1 in T.grid(10647, 80): + with T.block("T_concat"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_reshape[ax0 - 2535, ax1], T_reshape_1[ax0 - 507, ax1], T_reshape_2[ax0, ax1]) + T.writes(T_concat[ax0, ax1]) + T_concat[ax0, ax1] = T.if_then_else(2535 <= ax0, T_reshape[ax0 - 2535, ax1], T.if_then_else(507 <= ax0, T_reshape_1[ax0 - 507, ax1], T_reshape_2[ax0, ax1], dtype="float32"), dtype="float32") + for i0, i1 in T.grid(80, 10647): + with T.block("T_transpose"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_concat[ax1, ax0]) + T.writes(T_transpose[ax0, ax1]) + T_transpose[ax0, ax1] = T_concat[ax1, ax0] + for i0, i1, i2 in T.grid(1, 80, 10647): + with T.block("T_expand_dims"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(T_transpose[ax1, ax2]) + T.writes(T_expand_dims[ax0, ax1, ax2]) + T_expand_dims[ax0, ax1, ax2] = T_transpose[ax1, ax2] + + # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks # fmt: on @@ -101,5 +260,25 @@ def test_parallel_vectorize_unroll(): check_trace(spaces, expected) +def test_parallel_vectorize_unroll_spatial(): + mod = PureSpatial + target = Target("llvm --num-cores=32") + ctx = _create_context( + mod=mod, + target=target, + rule=ms.schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=-1, + max_vectorize_extent=-1, + unroll_max_steps=[1, 2, 4, 8, 16, 32, 64], + unroll_explicit=True, + ), + ) + spaces = ctx.space_generator.generate_design_space(mod=mod) + assert len(spaces) == 1 + trace = spaces[0].trace.simplified(remove_postproc=True) + assert not trace.insts + + if __name__ == "__main__": test_parallel_vectorize_unroll() + test_parallel_vectorize_unroll_spatial() From 3bee5cacd7da5295e42e99e92d1864a97c9ffe80 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 2 Jun 2022 11:22:02 -0700 Subject: [PATCH 023/181] [ci][wip] Upload docs with folder structure to S3 (#11528) Keeping the files as-is lets us serve them from S3 + CloudFront Co-authored-by: driazati --- Jenkinsfile | 7 +++++-- jenkins/Test.groovy.j2 | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b9175f06afdc5..334448a7ae24b 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-31T16:54:56.997402 +// Generated at 2022-06-01T16:34:53.941462 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -2875,7 +2875,10 @@ stage('Test') { label: 'Upload artifacts to S3', ) - archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) + sh( + script: "aws s3 cp --no-progress _docs s3://${s3_prefix}/docs --recursive", + label: 'Upload docs to S3', + ) } } } diff --git a/jenkins/Test.groovy.j2 b/jenkins/Test.groovy.j2 index d86575c247c75..d219b47bc7929 100644 --- a/jenkins/Test.groovy.j2 +++ b/jenkins/Test.groovy.j2 @@ -266,7 +266,10 @@ stage('Test') { ) } {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }} - archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) + sh( + script: "aws s3 cp --no-progress _docs s3://${s3_prefix}/docs --recursive", + label: 'Upload docs to S3', + ) } } } From a2f89c53cc761a9ef8fa918105486b81a539a02b Mon Sep 17 00:00:00 2001 From: apeskov Date: Thu, 2 Jun 2022 22:24:24 +0300 Subject: [PATCH 024/181] Restore integration test on Mac and Windows (#11538) Signed-off-by: Alexander Peskov --- tests/python/contrib/test_dnnl.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index fecd776d7065e..76e3f1c3a4055 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -17,6 +17,8 @@ import pytest import itertools import numpy as np +import sys +import subprocess import tvm from tvm import relay @@ -37,7 +39,21 @@ ids=["compile", "run"], ) -bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() +_bf16_supported = None + + +def bf16_supported(): + global _bf16_supported + if _bf16_supported is None: + _bf16_supported = False + if sys.platform.startswith("darwin"): + cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode() + for line in cpu_info.split("\n"): + if line.startswith("hw.optional.avx512f"): + _bf16_supported = bool(line.split(":", 1)[1]) + elif sys.platform.startswith("linux"): + _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() + return _bf16_supported def partition_for_dnnl(mod, params=None, alter_layout=True): @@ -150,7 +166,7 @@ def check_dnnl_used(mod, subgraph_num=None): (True, False, False), (True, True, False), ] - if test_bf16 and bf16_supported: + if test_bf16 and bf16_supported(): configs += [(True, False, True), (True, True, True)] for use_dnnl, alter_layout, use_bf16 in configs: result_key = ( From 03eefe0b41587fecb910f3543b0ddc1adeb4fcff Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 2 Jun 2022 12:43:06 -0700 Subject: [PATCH 025/181] [ci] Add @tvm-bot rerun (#11480) This adds a command to restart CI runs that have stopped (either from a failure, success, or abort) via GitHub comments addressed to tvm-bot: ``` @tvm-bot rerun ``` tvm-bot will then comment on the thread and send a request to Jenkins to restart CI. This does not restart GitHub Actions jobs though we may be able to add that in the future. 
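Under the hood the new `Rerun` command simply pokes Jenkins: it POSTs to the PR
job's `buildWithParameters` endpoint as the `tvm-bot` user, authenticated with
the `TVM_BOT_JENKINS_TOKEN` secret (see `rerun_jenkins_ci` and the `post`
helper in the diff below). A minimal standalone sketch of that call follows;
the PR number is a placeholder and the token is read from the environment,
this is an illustration rather than the bot's exact code path:

```
# Sketch of the Jenkins trigger behind "@tvm-bot rerun".
# Endpoint, user name, and token variable come from this patch; the PR
# number below is only a placeholder for illustration.
import base64
import os
from urllib import request

JENKINS_URL = "https://ci.tlcpack.ai/"
pr_number = 11480  # placeholder

url = JENKINS_URL + f"job/tvm/job/PR-{pr_number}/buildWithParameters"
token = os.environ.get("TVM_BOT_JENKINS_TOKEN", "")
auth = base64.b64encode(f"tvm-bot:{token}".encode("utf-8")).decode("utf-8")

req = request.Request(url, data=b"", method="POST")
req.add_header("Authorization", f"Basic {auth}")
with request.urlopen(req) as response:
    print(response.status)
```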
Co-authored-by: driazati --- .github/workflows/{merge.yml => tvmbot.yml} | 11 +- tests/python/ci/sample_prs/pr10786-badci.json | 3 +- .../sample_prs/pr10786-changes-requested.json | 3 +- .../ci/sample_prs/pr10786-co-authors.json | 2 +- .../ci/sample_prs/pr10786-invalid-author.json | 3 +- .../python/ci/sample_prs/pr10786-merges.json | 2 +- .../ci/sample_prs/pr10786-missing-job.json | 2 +- .../ci/sample_prs/pr10786-nottriggered.json | 2 +- .../ci/sample_prs/pr10786-oldreview.json | 2 +- .../pr11244-unauthorized-comment.json | 2 +- .../ci/sample_prs/pr11267-no-review.json | 4 +- .../ci/sample_prs/pr11276-no-review.json | 157 ------------- ...o-recomment.json => pr11442-rerun-ci.json} | 12 +- tests/python/ci/test_mergebot.py | 66 ++++-- tests/scripts/git_utils.py | 22 ++ .../{github_mergebot.py => github_tvmbot.py} | 219 +++++++++++------- 16 files changed, 239 insertions(+), 273 deletions(-) rename .github/workflows/{merge.yml => tvmbot.yml} (62%) delete mode 100644 tests/python/ci/sample_prs/pr11276-no-review.json rename tests/python/ci/sample_prs/{pr11442-no-recomment.json => pr11442-rerun-ci.json} (95%) rename tests/scripts/{github_mergebot.py => github_tvmbot.py} (80%) diff --git a/.github/workflows/merge.yml b/.github/workflows/tvmbot.yml similarity index 62% rename from .github/workflows/merge.yml rename to .github/workflows/tvmbot.yml index efbada4b00a46..c9d2cf71e6a70 100644 --- a/.github/workflows/merge.yml +++ b/.github/workflows/tvmbot.yml @@ -1,5 +1,5 @@ -name: Merge +name: tvm-bot on: status: pull_request_review: @@ -12,16 +12,19 @@ concurrency: cancel-in-progress: true jobs: - maybe-merge: + run-tvm-bot: if: github.repository == 'apache/tvm' runs-on: ubuntu-20.04 + if: ${{ github.event.issue.pull_request }} steps: - uses: actions/checkout@v2 - - name: Merge if requested and possible + - name: Run tvm-bot env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + TVM_BOT_JENKINS_TOKEN: ${{ secrets.TVM_BOT_JENKINS_TOKEN }} PR_NUMBER: ${{ github.event.issue.number }} + ISSUE_COMMENT: ${{ toJson(github.event.comment) }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | set -eux - python tests/scripts/github_mergebot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" + python tests/scripts/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT" diff --git a/tests/python/ci/sample_prs/pr10786-badci.json b/tests/python/ci/sample_prs/pr10786-badci.json index b49899b86bcae..7e9d10d0b6481 100644 --- a/tests/python/ci/sample_prs/pr10786-badci.json +++ b/tests/python/ci/sample_prs/pr10786-badci.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { @@ -119,6 +119,7 @@ "commit": { "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" }, + "id": 123, "author": { "login": "kparzysz-quic" }, diff --git a/tests/python/ci/sample_prs/pr10786-changes-requested.json b/tests/python/ci/sample_prs/pr10786-changes-requested.json index 46b13a7f6c6c0..24e261099a4ff 100644 --- a/tests/python/ci/sample_prs/pr10786-changes-requested.json +++ b/tests/python/ci/sample_prs/pr10786-changes-requested.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { @@ -120,6 +120,7 @@ "commit": { "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" }, + "id": 123, "author": { "login": "kparzysz-quic" }, diff --git a/tests/python/ci/sample_prs/pr10786-co-authors.json b/tests/python/ci/sample_prs/pr10786-co-authors.json index a660c9d9b214a..75f2728250597 100644 --- a/tests/python/ci/sample_prs/pr10786-co-authors.json +++ b/tests/python/ci/sample_prs/pr10786-co-authors.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr10786-invalid-author.json b/tests/python/ci/sample_prs/pr10786-invalid-author.json index d19d6dad8a442..81b028e3196ae 100644 --- a/tests/python/ci/sample_prs/pr10786-invalid-author.json +++ b/tests/python/ci/sample_prs/pr10786-invalid-author.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. 
Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { @@ -114,6 +114,7 @@ "nodes": [ { "body": "@tvm-bot merge", + "id": 123, "updatedAt": "2022-03-25T22:13:50Z", "authorCanPushToRepository": false, "commit": { diff --git a/tests/python/ci/sample_prs/pr10786-merges.json b/tests/python/ci/sample_prs/pr10786-merges.json index c7b6940f0d5b3..0226c8ab52454 100644 --- a/tests/python/ci/sample_prs/pr10786-merges.json +++ b/tests/python/ci/sample_prs/pr10786-merges.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free.\n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n\nPreviously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\n\n\ncc @someone\n\r\n\r\nCo-authored-by: Adam Straw \n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr10786-missing-job.json b/tests/python/ci/sample_prs/pr10786-missing-job.json index 81be0ebe47950..13739b793fb53 100644 --- a/tests/python/ci/sample_prs/pr10786-missing-job.json +++ b/tests/python/ci/sample_prs/pr10786-missing-job.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr10786-nottriggered.json b/tests/python/ci/sample_prs/pr10786-nottriggered.json index 11c5976bd6e40..0da541c4342df 100644 --- a/tests/python/ci/sample_prs/pr10786-nottriggered.json +++ b/tests/python/ci/sample_prs/pr10786-nottriggered.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr10786-oldreview.json b/tests/python/ci/sample_prs/pr10786-oldreview.json index 27ba0e8729181..1a2556cb6f5f1 100644 --- a/tests/python/ci/sample_prs/pr10786-oldreview.json +++ b/tests/python/ci/sample_prs/pr10786-oldreview.json @@ -3,7 +3,7 @@ "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", "state": "OPEN", "author": { - "login": "Lunderberg" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json b/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json index 206adc9a9eacf..beafc05958b64 100644 --- a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json +++ b/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json @@ -3,7 +3,7 @@ "body": "See [this thread ](https://discuss.tvm.apache.org/t/crt-add-platform-specific-pre-and-post-function-calls-in-crt-runtime/12723)for an explanation.", "state": "OPEN", "author": { - "login": "fPecc" + "login": "abc" }, "comments": { "pageInfo": { diff --git a/tests/python/ci/sample_prs/pr11267-no-review.json b/tests/python/ci/sample_prs/pr11267-no-review.json index 31577671f0b6b..d2ad164673e5a 100644 --- a/tests/python/ci/sample_prs/pr11267-no-review.json +++ b/tests/python/ci/sample_prs/pr11267-no-review.json @@ -3,7 +3,7 @@ "body": "This adds `/opt/sccache` to the PATH of each of the CI docker images so when cmake looks for a C compiler it will pick up the sccache wrapper by default. 
This fixes some issues where compiler invocations weren't being run though sccache. With this approach the invoker doesn't need to do anything specific to set up sccache.\n\nThis will require a follow up PR to update the Docker images and remove some of the sccache logic in `task_build.py`\n\n\n\ncc @Mousius @areusch", "state": "OPEN", "author": { - "login": "driazati" + "login": "abc" }, "comments": { "pageInfo": { @@ -15,6 +15,7 @@ "author": { "login": "areusch" }, + "id": 124, "updatedAt": "2022-05-11T16:54:32Z", "body": "just confirming--we can disable this when doing a local build, correct? what's the mechanism by which we do that?" }, @@ -23,6 +24,7 @@ "author": { "login": "driazati" }, + "id": 123, "updatedAt": "2022-05-11T18:46:54Z", "body": "@tvm-bot merge" } diff --git a/tests/python/ci/sample_prs/pr11276-no-review.json b/tests/python/ci/sample_prs/pr11276-no-review.json deleted file mode 100644 index 3f8459eb00f7b..0000000000000 --- a/tests/python/ci/sample_prs/pr11276-no-review.json +++ /dev/null @@ -1,157 +0,0 @@ -{ - "title": "[COMMUNITY] mikepapadim -> Reviewer", - "body": "Please join us to welcome Michalis Papadimitriou (@mikepapadim) as a new reviewer to TVM. Michalis has contributed a lot to BYOC and TensorRT backend.\r\n\r\n- [Commits History](https://github.com/apache/tvm/commits?author=mikepapadim)\r\n- [Code Review](https://github.com/apache/tvm/pulls?utf8=%E2%9C%93&q=reviewed-by:mikepapadim)\r\n- [Community Forum Summary](https://github.com/apache/tvm/commits?author=mikepapadim)", - "state": "OPEN", - "author": { - "login": "ZihengJiang" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "ZihengJiang", - "email": "ziheng@apache.org" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "96075744cc687caafc131361d006c5967edddbc6", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391733373" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391732791" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391754960" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391732788" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391754947" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6391733127" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/branch", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/ziheng%252Fcommunity/1/display/redirect" - }, - { - "state": 
"SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11276/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "", - "updatedAt": "2022-05-11T16:50:16Z", - "url": "https://github.com/apache/tvm/pull/11276#pullrequestreview-969701502", - "authorCanPushToRepository": true, - "commit": { - "oid": "96075744cc687caafc131361d006c5967edddbc6" - }, - "author": { - "login": "tqchen" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr11442-no-recomment.json b/tests/python/ci/sample_prs/pr11442-rerun-ci.json similarity index 95% rename from tests/python/ci/sample_prs/pr11442-no-recomment.json rename to tests/python/ci/sample_prs/pr11442-rerun-ci.json index 77af805f2180e..0199b2921f648 100644 --- a/tests/python/ci/sample_prs/pr11442-no-recomment.json +++ b/tests/python/ci/sample_prs/pr11442-rerun-ci.json @@ -3,7 +3,7 @@ "body": "(See https://discuss.tvm.apache.org/t/byoc-supporting-cutlass-byoc-with-collage/12796/6 for\r\ncontext, which in turn is part of Collage (https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md).\r\n\r\nThis adds a new 'DSO exportable' runtime module representing the contents of a .o file. It\r\nallows external codegen toolchains to yield a result which:\r\n - Like CSource modules, can be conveyed directly to the final export_library compilation\r\n step for linking into the final .so and saved to a know location without risk the\r\n underlying code artifact will be lost.\r\n - Like DSOLibrary modules, are self contained so that no additional compile-time arguments\r\n need be conveyed from the CSource module to the final export_library command line\r\n\r\nSince this is the third flavor of 'DSO exportable' module, add a Module::IsDSOExportable.\r\n\r\nSince adding the above, can't resist also adding a Module::ImplementsFunction virtual and\r\ncalling it from TEComplier to check if an external codegen function actually provided the\r\nimplementation it promised.\r\n\r\nNote:\r\n - I've left the existing implementation of runtime.load_module alone which\r\n relinks .o files to .so files.\r\n - Though also contained in the .o metadata, I require static libraries to always\r\n carry their list of exported function names.\r\n\r\nThis is all pretty stop gap pending a good rework of TVM to supoprt the notion of artifacts\r\nand, perhaps, build rules.\r\n", "state": "OPEN", "author": { - "login": "mbs-octoml" + "login": "abc" }, "comments": { "pageInfo": { @@ -64,15 +64,7 @@ "login": "mbs-octoml" }, "updatedAt": "2022-05-25T22:12:37Z", - "body": "Hmff." 
- }, - { - "authorAssociation": "NONE", - "author": { - "login": "github-actions" - }, - "updatedAt": "2022-05-25T22:12:55Z", - "body": "Cannot merge, did not find any approving reviews from users with write access on 96d4e62da5a7b78da18d0ee28cc6261d8fbf31c4" + "body": "@tvm-bot rerun" } ] }, diff --git a/tests/python/ci/test_mergebot.py b/tests/python/ci/test_mergebot.py index b9f944e897d3f..a565cc76a5c14 100644 --- a/tests/python/ci/test_mergebot.py +++ b/tests/python/ci/test_mergebot.py @@ -29,8 +29,8 @@ class TempGit: def __init__(self, cwd): self.cwd = cwd - def run(self, *args): - proc = subprocess.run(["git"] + list(args), cwd=self.cwd) + def run(self, *args, **kwargs): + proc = subprocess.run(["git"] + list(args), cwd=self.cwd, **kwargs) if proc.returncode != 0: raise RuntimeError(f"git command failed: '{args}'") @@ -50,87 +50,118 @@ def run(self, *args): "number": 10786, "filename": "pr10786-merges.json", "expected": SUCCESS_EXPECTED_OUTPUT, + "comment": "@tvm-bot merge", + "user": "abc", "detail": "Everything is fine so this PR will merge", }, "no-request": { "number": 10786, "filename": "pr10786-nottriggered.json", - "expected": "No merge requested, exiting", + "expected": "Command 'do something else' did not match anything", + "comment": "@tvm-bot do something else", + "user": "abc", "detail": "A PR for which the mergebot runs but no merge is requested", }, "bad-ci": { "number": 10786, "filename": "pr10786-badci.json", "expected": "Cannot merge, these CI jobs are not successful on", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "A PR which failed CI and cannot merge", }, "old-review": { "number": 10786, "filename": "pr10786-oldreview.json", "expected": "Cannot merge, did not find any approving reviews", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "A PR with passing CI and approving reviews on an old commit so it cannot merge", }, "missing-job": { "number": 10786, "filename": "pr10786-missing-job.json", "expected": "Cannot merge, missing expected jobs", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "PR missing an expected CI job and cannot merge", }, "invalid-author": { "number": 10786, "filename": "pr10786-invalid-author.json", - "expected": "No merge requested, exiting", + "expected": "Comment is not from from PR author or collaborator, quitting", + "comment": "@tvm-bot merge", + "user": "not-abc", "detail": "Merge requester is not a committer and cannot merge", }, "unauthorized-comment": { "number": 11244, "filename": "pr11244-unauthorized-comment.json", - "expected": "No merge requested, exiting", + "expected": "Comment is not from from PR author or collaborator, quitting", + "comment": "@tvm-bot merge", + "user": "not-abc2", "detail": "Check that a merge comment not from a CONTRIBUTOR is rejected", }, "no-review": { "number": 11267, "filename": "pr11267-no-review.json", "expected": "Cannot merge, did not find any approving reviews from users with write access", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "Check that a merge request without any reviews is rejected", }, "changes-requested": { "number": 10786, "filename": "pr10786-changes-requested.json", "expected": "Cannot merge, found [this review]", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "Check that a merge request with a 'Changes Requested' review on HEAD is rejected", }, "co-authors": { "number": 10786, "filename": "pr10786-co-authors.json", "expected": "Co-authored-by: Some One ", + "comment": "@tvm-bot merge", + "user": "abc", "detail": "Check that 
a merge request with co-authors generates the correct commit message", }, - "no-recomment": { + "rerun-ci": { "number": 11442, - "filename": "pr11442-no-recomment.json", - "expected": "No merge requested, exiting", - "detail": "Check that comments after a failed merge don't trigger another merge", + "filename": "pr11442-rerun-ci.json", + "expected": "Rerunning ci with", + "comment": "@tvm-bot rerun", + "user": "abc", + "detail": "Start a new CI job", }, } @pytest.mark.parametrize( - ["number", "filename", "expected", "detail"], + ["number", "filename", "expected", "comment", "user", "detail"], [tuple(d.values()) for d in test_data.values()], ids=test_data.keys(), ) -def test_mergebot(tmpdir_factory, number, filename, expected, detail): - mergebot_script = REPO_ROOT / "tests" / "scripts" / "github_mergebot.py" +def test_mergebot(tmpdir_factory, number, filename, expected, comment, user, detail): + mergebot_script = REPO_ROOT / "tests" / "scripts" / "github_tvmbot.py" test_json_dir = Path(__file__).resolve().parent / "sample_prs" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - git.run("checkout", "-b", "main") + git.run("init", stderr=subprocess.PIPE, stdout=subprocess.PIPE) + git.run("checkout", "-b", "main", stderr=subprocess.PIPE, stdout=subprocess.PIPE) git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") with open(test_json_dir / filename) as f: test_data = json.load(f) + comment = { + "body": comment, + "id": 123, + "user": { + "login": user, + }, + } + collaborators = [] + proc = subprocess.run( [ str(mergebot_script), @@ -141,10 +172,17 @@ def test_mergebot(tmpdir_factory, number, filename, expected, detail): "https://example.com", "--testing-pr-json", json.dumps(test_data), + "--testing-collaborators-json", + json.dumps(collaborators), + "--trigger-comment-json", + json.dumps(comment), ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8", + env={ + "TVM_BOT_JENKINS_TOKEN": "123", + }, cwd=git.cwd, ) if proc.returncode != 0: diff --git a/tests/scripts/git_utils.py b/tests/scripts/git_utils.py index 9f2468638cade..7cd1b6b2fe596 100644 --- a/tests/scripts/git_utils.py +++ b/tests/scripts/git_utils.py @@ -19,6 +19,7 @@ import json import subprocess import re +import base64 from urllib import request from typing import Dict, Tuple, Any, Optional, List @@ -29,6 +30,27 @@ def compress_query(query: str) -> str: return query +def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None): + print(f"Requesting POST to", url, "with", body) + headers = {} + if auth is not None: + auth_str = base64.b64encode(f"{auth[0]}:{auth[1]}") + request.add_header("Authorization", f"Basic {auth_str}") + + if body is None: + body = "" + + req.add_header("Content-Type", "application/json; charset=utf-8") + req = request.Request(url, headers=headers, method="POST") + data = json.dumps(body) + data = data.encode("utf-8") + req.add_header("Content-Length", len(data)) + + with request.urlopen(req, data) as response: + response = json.loads(response.read()) + return response + + class GitHubRepo: def __init__(self, user, repo, token): self.token = token diff --git a/tests/scripts/github_mergebot.py b/tests/scripts/github_tvmbot.py similarity index 80% rename from tests/scripts/github_mergebot.py rename to tests/scripts/github_tvmbot.py index 76e0803efc23a..bfdbeb4039e52 100755 --- a/tests/scripts/github_mergebot.py +++ b/tests/scripts/github_tvmbot.py @@ -23,17 +23,21 @@ import logging import traceback import re -from typing import 
Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, Callable from pathlib import Path -from git_utils import git, GitHubRepo, parse_remote +from git_utils import git, GitHubRepo, parse_remote, post from cmd_utils import init_log Review = Dict[str, Any] CIJob = Dict[str, Any] +Comment = Dict[str, Any] +CommentChecker = Callable[[Comment], bool] EXPECTED_JOBS = ["tvm-ci/pr-head"] +TVM_BOT_JENKINS_TOKEN = os.environ["TVM_BOT_JENKINS_TOKEN"] +JENKINS_URL = "https://ci.tlcpack.ai/" THANKS_MESSAGE = r"(\s*)Thanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from \[Reviewers\]\(https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers\) by them in the pull request thread.(\s*)" @@ -41,6 +45,19 @@ def to_json_str(obj: Any) -> str: return json.dumps(obj, indent=2) +COLLABORATORS_QUERY = """ +query ($owner: String!, $name: String!, $user: String!) { + repository(owner: $owner, name: $name) { + collaborators(query: $user, first: 1) { + nodes { + login + } + } + } +} +""" + + PR_QUERY = """ query ($owner: String!, $name: String!, $number: Int!) { repository(owner: $owner, name: $name) { @@ -60,6 +77,7 @@ def to_json_str(obj: Any) -> str: author { login } + id updatedAt body } @@ -119,6 +137,7 @@ def to_json_str(obj: Any) -> str: body updatedAt url + id authorCanPushToRepository commit { oid @@ -202,6 +221,17 @@ def checker(obj, parent_key): def __repr__(self): return json.dumps(self.raw, indent=2) + def plus_one(self, comment: Dict[str, Any]): + """ + React with a thumbs up to a comment + """ + url = f"issues/comments/{comment['id']}/reactions" + data = {"content": "+1"} + if self.dry_run: + logging.info(f"Dry run, would have +1'ed to {url} with {data}") + else: + self.github.post(url, data=data) + def head_commit(self): return self.raw["commits"]["nodes"][0]["commit"] @@ -292,6 +322,19 @@ def fetch_data(self): }, )["data"]["repository"]["pullRequest"] + def search_collaborator(self, user: str) -> List[Dict[str, Any]]: + """ + Query GitHub for collaborators matching 'user' + """ + return self.github.graphql( + query=COLLABORATORS_QUERY, + variables={ + "owner": self.owner, + "name": self.repo_name, + "user": user, + }, + )["data"]["repository"]["collaborators"]["nodes"] + def comment(self, text: str) -> None: """ Leave the comment 'text' on this PR @@ -370,70 +413,8 @@ def merge(self) -> None: self.github.put(url, data=data) - def comment_can_merge(self, comment: Dict[str, Any]) -> bool: - """ - Check if a comment was left by the PR author or by a committer - """ - if comment["author"]["login"] == self.raw["author"]["login"]: - logging.info(f"Comment {comment} was from author and is mergeable") - return True - - if comment.get("authorAssociation", "") == "CONTRIBUTOR": - logging.info(f"Comment {comment} was from committer comment and is mergeable") - return True - - if comment.get("authorCanPushToRepository", False): - logging.info(f"Comment {comment} was from a committer review comment and is mergeable") - return True - - logging.info(f"Comment {comment} was not from author or committers and is not mergeable") - return False - - def merge_requested(self) -> bool: - """ - Check if this PR has had a merge requested - """ - merge_commands = [ - "merge", - "merge this", - "merge this pr", - ] - cancel_commands = [ - "cancel", - "cancel merge", - "cancel the merge", - "stop", - "stop merge", - "stop the merge", - ] - - def 
parse_action(comment: Dict[str, Any]) -> Optional[str]: - if comment["author"]["login"] == "github-actions": - return "commented" - - if not self.comment_can_merge(comment): - return None - - body = comment["body"] - if any(f"@tvm-bot {c}" in body for c in merge_commands): - return "merge" - - if any(f"@tvm-bot {c}" in body for c in cancel_commands): - return "cancel" - - return None - - # Check regular comments and top-level review comments - all_comments = self.raw["comments"]["nodes"] + self.reviews() - all_comments = sorted(all_comments, key=lambda comment: comment["updatedAt"]) - actions = [parse_action(comment) for comment in all_comments] - logging.info(f"Found these tvm-bot actions: {actions}") - actions = [a for a in actions if a is not None] - - if len(actions) == 0: - return False - - return actions[-1] == "merge" + def author(self) -> str: + return self.raw["author"]["login"] def find_failed_ci_jobs(self) -> List[CIJob]: # NEUTRAL is GitHub Action's way of saying cancelled @@ -502,6 +483,49 @@ def merge_if_passed_checks(self) -> None: self.comment(f"Cannot merge, CI did not pass on on {self.head_oid()}") return + def rerun_jenkins_ci(self) -> None: + url = JENKINS_URL + f"job/tvm/job/PR-{self.number}/buildWithParameters" + logging.info(f"Rerunning ci with URL={url}") + if self.dry_run: + logging.info("Dry run, not sending POST") + else: + post(url, auth=("tvm-bot", TVM_BOT_JENKINS_TOKEN)) + + +class Merge: + triggers = [ + "merge", + "merge this", + "merge this pr", + ] + + @staticmethod + def run(pr: PR): + try: + pr.merge_if_passed_checks() + except Exception as e: + if not args.dry_run: + msg = traceback.format_exc() + pr.comment( + f"Failed to process merge request in {args.run_url}\n\n
<details>\n\n```\n{msg}\n```\n\n</details>
" + ) + raise e + + +class Rerun: + triggers = [ + "rerun", + "rerun ci", + "re-run", + "re-run ci", + "run", + "run ci", + ] + + @staticmethod + def run(pr: PR): + pr.rerun_jenkins_ci() + if __name__ == "__main__": help = "Check if a PR has comments trying to merge it, and do so based on reviews/CI status" @@ -509,7 +533,13 @@ def merge_if_passed_checks(self) -> None: parser.add_argument("--remote", default="origin", help="ssh remote to parse") parser.add_argument("--pr", required=True, help="pr number to check") parser.add_argument("--run-url", required=True, help="workflow run URL") + parser.add_argument( + "--trigger-comment-json", required=True, help="json of the comment that triggered this run" + ) parser.add_argument("--testing-pr-json", help="(testing only) manual data for testing") + parser.add_argument( + "--testing-collaborators-json", help="(testing only) manual data for testing" + ) parser.add_argument( "--dry-run", action="store_true", @@ -518,7 +548,27 @@ def merge_if_passed_checks(self) -> None: ) args = parser.parse_args() init_log() + comment = json.loads(args.trigger_comment_json) + body = comment["body"].strip() + + # Check that the comment was addressed to tvm-bot + if not body.startswith("@tvm-bot "): + logging.info(f"Not a bot comment, '{body}' does not start with '@tvm-bot'") + exit(0) + # Find the code to run for the command from the user + user_command = body.lstrip("@tvm-bot").strip() + command_to_run = None + for command in [Merge, Rerun]: + if user_command in command.triggers: + command_to_run = command + break + + if command_to_run is None: + logging.info(f"Command '{user_command}' did not match anything") + exit(0) + + # Find the remote for querying more data about the PR remote = git(["config", "--get", f"remote.{args.remote}.url"]) logging.info(f"Using remote remote={remote}") owner, repo = parse_remote(remote) @@ -539,21 +589,34 @@ def merge_if_passed_checks(self) -> None: else: pr = PR(number=int(args.pr), owner=owner, repo=repo, dry_run=args.dry_run) + # Acknowledge the comment with a react + pr.plus_one(comment) + + # Check the comment author + comment_author = comment["user"]["login"] + if pr.author() == comment_author: + logging.info("Comment user is PR author, continuing") + else: + logging.info("Comment is not from PR author, checking collaborators") + # Get the list of collaborators for the repo filtered by the comment + # author + if args.testing_collaborators_json: + collaborators = json.loads(args.testing_collaborators_json) + else: + collaborators = pr.search_collaborator(comment_author) + logging.info(f"Found collaborators: {collaborators}") + + if len(collaborators) > 0: + logging.info("Comment is from collaborator") + else: + logging.info("Comment is not from from PR author or collaborator, quitting") + exit(0) + state = pr.state() if state != "OPEN": logging.info(f"Ignoring event on PR, state was not OPEN, instead was state={state}") exit(0) - if pr.merge_requested(): - try: - pr.merge_if_passed_checks() - except Exception as e: - if not args.dry_run: - msg = traceback.format_exc() - pr.comment( - f"Failed to process merge request in {args.run_url}\n\n
<details>\n\n```\n{msg}\n```\n\n</details>
" - ) - raise e - else: - logging.info("No merge requested, exiting") + # Run the command + command_to_run.run(pr) From c78539cc59b60b77794276699f9430cd5e838106 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 2 Jun 2022 15:08:13 -0500 Subject: [PATCH 026/181] [TIR][Arith] Additional Simplifications Inside Conditionals (#11524) * [TIR][Arith] Use equality constraints in analyzer Previously, constraints with inequalities were recognized and used for simplifications by `ConstIntBoundAnalyzer` and `ModularSetAnalyzer`, but constraints with equalities were not. This adds equality-based constraints. (e.g. Inside the then-case of `if i==5`, the value of `i` is known to be 5.) * [TIR][Arith] RewriteSimplifier, apply literal constraints Previously, constraints were only checked within a `tir.likely` annotation. After this change, constraints are used for simplification of all boolean expressions. (e.g. Within a conditional `if i==n`, the expression `(i==n) and (j==m)` can be simplified to `j==m`.) * [TIR][Arith] Do not apply literal constraints to BufferLoad If a literal constraint relies on the contents of a buffer, the constraint may not be assumed to hold. This prevents the incorrect rewriting of `A[i]==n` to true within a `if A[i]==n` conditional, as the value of `A[i]` may have changed. * [TIR][Arith] Use each independent constraints in RewriteSimplifier Inside a constraint `if i==n and j==m`, both `i==n` and `j==m` may be replaced with true, even in separate expressions. This commit uses a new internal utility function `tvm::arith::ExtractConstraints`, which breaks up a boolean expression into a list of true statements. This may be used to reduce duplication elsewhere, such as `const_int_bound.cc` and `iter_affine_map.cc`. * [TIR][Arith] Check for negation of literal constraints When inside a conditional of `i!=n`, in addition to the previous replacement of `i!=n` with true, we can also replace `i==n` with false. * [TIR][Arith] Added unittests for new simplifications * Fix lint error * Fixed handling of negation of non-boolean types * Removed extra asterisk --- src/arith/const_int_bound.cc | 3 + src/arith/constraint_extract.cc | 55 +++++ src/arith/constraint_extract.h | 58 +++++ src/arith/modular_set.cc | 4 + src/arith/rewrite_simplify.cc | 50 +++- src/arith/rewrite_simplify.h | 9 + .../unittest/test_tir_transform_simplify.py | 233 +++++++++++++++++- 7 files changed, 398 insertions(+), 14 deletions(-) create mode 100644 src/arith/constraint_extract.cc create mode 100644 src/arith/constraint_extract.h diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index cb125551c4683..4fd27a0fde10d 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -598,6 +598,9 @@ class ConstIntBoundAnalyzer::Impl if ((x < c).Match(cond)) { return {BoundInfo(x.Eval(), MakeBound(kNegInf, c.Eval()->value - 1))}; } + if ((x == c).Match(cond) || (c == x).Match(cond)) { + return {BoundInfo(x.Eval(), MakeBound(c.Eval()->value, c.Eval()->value))}; + } if ((x && y).Match(cond)) { auto ret1 = DetectBoundInfo(x.Eval()); auto ret2 = DetectBoundInfo(y.Eval()); diff --git a/src/arith/constraint_extract.cc b/src/arith/constraint_extract.cc new file mode 100644 index 0000000000000..d0bf57497e63e --- /dev/null +++ b/src/arith/constraint_extract.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/arith/constraint_extract.cc + */ + +#include "constraint_extract.h" + +#include +#include + +#include "pattern_match.h" + +namespace tvm { +namespace arith { + +void CollectConstraints(const PrimExpr& expr, Analyzer* analyzer, std::vector* collect) { + collect->push_back(expr); + + PVar x, y; + if ((x && y).Match(expr)) { + CollectConstraints(x.Eval(), analyzer, collect); + CollectConstraints(y.Eval(), analyzer, collect); + } else if ((!(x || y)).Match(expr)) { + CollectConstraints(analyzer->rewrite_simplify(tir::Not(x.Eval())), analyzer, collect); + CollectConstraints(analyzer->rewrite_simplify(tir::Not(y.Eval())), analyzer, collect); + } +} + +std::vector ExtractConstraints(const PrimExpr& expr) { + std::vector out; + Analyzer analyzer; + CollectConstraints(expr, &analyzer, &out); + return out; +} + +} // namespace arith +} // namespace tvm diff --git a/src/arith/constraint_extract.h b/src/arith/constraint_extract.h new file mode 100644 index 0000000000000..ea6e0a74419ce --- /dev/null +++ b/src/arith/constraint_extract.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file contraint_extract.h + * + * \brief Centralized location for extraction of constraints from a boolean expression. + */ + +#ifndef TVM_ARITH_CONSTRAINT_EXTRACT_H_ +#define TVM_ARITH_CONSTRAINT_EXTRACT_H_ + +#include + +#include + +namespace tvm { +namespace arith { + +/* \brief Returns constraints that are true if the expression is true. + * + * Utility to break up a boolean expression into independent + * constraints. + * + * Example: `i==5 && j==3` => `[i==5 && j==3, i==5, j==3]` + * Example: `i==5 || j==3` => `[i==5 || j==3]` + * Example: `!(i>5 || j==3)` => `[!(i==5 || j==3), i<=5, j!=3]` + * + * Intended for use in bounds analysis or simplification within a + * conditional, or identifying independent conditionals that may be + * hoisted. 
+ * + * \param expr The expression to be analyzers + * + * \returns A vector of independent constraints + */ +std::vector ExtractConstraints(const PrimExpr& expr); + +} // namespace arith +} // namespace tvm + +#endif // TVM_ARITH_CONSTRAINT_EXTRACT_H_ diff --git a/src/arith/modular_set.cc b/src/arith/modular_set.cc index afc28a5ed2859..4cad570ab3359 100644 --- a/src/arith/modular_set.cc +++ b/src/arith/modular_set.cc @@ -112,6 +112,10 @@ class ModularSetAnalyzer::Impl : public ExprFunctorvalue, base.Eval()->value); return UpdateByIntersect(var.Eval(), entry); } + if ((var == base).Match(constraint) || (base == var).Match(constraint)) { + Entry entry(1, base.Eval()->value); + return UpdateByIntersect(var.Eval(), entry); + } return nullptr; } diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index f9e38dee48e50..a168e1f0836ca 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -32,6 +32,7 @@ #include "../target/datatype/registry.h" #include "const_fold.h" +#include "constraint_extract.h" #include "pattern_match.h" namespace tvm { @@ -228,7 +229,24 @@ std::function RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c size_t old_literal_size = literal_constraints_.size(); // we will compare the already simplified result with the constraint, // so simplify the constarint as well - literal_constraints_.push_back(operator()(constraint)); + PrimExpr new_constraint = operator()(constraint); + for (const PrimExpr& subconstraint : ExtractConstraints(new_constraint)) { + if (SideEffect(subconstraint) <= CallEffectKind::kPure) { + literal_constraints_.push_back(subconstraint); + // We could apply this during TryMatchLiteralConstraint, but + // that would require performing a rewrite of each expression + // being checked. This way, we only apply a rewrite for each + // constraint being applied. 
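+      // For a boolean subconstraint such as `i == n`, the simplified negation below is
+      // `i != n`, and storing `Not(i != n)` lets TryMatchLiteralConstraint later rewrite
+      // occurrences of `i != n` to false. For a non-boolean subconstraint `v`, the
+      // negation is formed as `v == 0` instead.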
+ PrimExpr negation; + if (subconstraint.dtype().is_bool()) { + negation = Not(subconstraint); + } else { + negation = subconstraint == make_zero(subconstraint.dtype()); + } + negation = operator()(negation); + literal_constraints_.push_back(Not(negation)); + } + } size_t new_literal_size = literal_constraints_.size(); auto frecover = [old_literal_size, new_literal_size, this]() { ICHECK_EQ(literal_constraints_.size(), new_literal_size); @@ -1291,11 +1309,27 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MaxNode* op) { return ret; } +Optional RewriteSimplifier::Impl::TryMatchLiteralConstraint(const PrimExpr& expr) const { + PrimExpr negation = Not(expr); + + ExprDeepEqual expr_equal; + for (const auto& constraint : literal_constraints_) { + if (expr_equal(constraint, expr)) { + return make_const(expr->dtype, true); + } + if (expr_equal(constraint, negation)) { + return make_const(expr->dtype, false); + } + } + return NullOpt; +} + PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); PrimExpr const_res = TryConstFold(op->a, op->b); if (const_res.defined()) return const_res; + if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression PVar x, y; @@ -1344,6 +1378,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) { op = ret.as(); PrimExpr const_res = TryConstFold(op->a, op->b); if (const_res.defined()) return const_res; + if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression PVar x, y, z, s1, s2; @@ -1475,6 +1510,8 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) { op = ret.as(); PrimExpr const_res = TryConstFold(op->a); if (const_res.defined()) return const_res; + if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); + // Pattern var to match any expression PVar x, y; PVar lanes; @@ -1499,6 +1536,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) { op = ret.as(); PrimExpr const_res = TryConstFold(op->a, op->b); if (const_res.defined()) return const_res; + if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression PVar x, y; @@ -1538,6 +1576,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) { op = ret.as(); PrimExpr const_res = TryConstFold(op->a, op->b); if (const_res.defined()) return const_res; + if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression PVar x, y; @@ -1602,13 +1641,10 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) { return op->args[0] << op->args[1]; } } - ExprDeepEqual expr_equal; if (op->op.same_as(tir::builtin::likely())) { - for (const auto& constraint : literal_constraints_) { - // Cases such as for (i, 0, bound) {if (likely(iter_var < bound)) { .. } } - if (expr_equal(constraint, op->args[0])) { - return make_const(op->dtype, true); - } + // Cases such as for (i, 0, bound) {if (likely(iter_var < bound)) { .. } } + if (auto match = TryMatchLiteralConstraint(op->args[0])) { + return match.value(); } } return ret; diff --git a/src/arith/rewrite_simplify.h b/src/arith/rewrite_simplify.h index 202b9209da6df..6007b6416742c 100644 --- a/src/arith/rewrite_simplify.h +++ b/src/arith/rewrite_simplify.h @@ -105,6 +105,15 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer { */ bool CanInlineLet(const LetNode* op); + /*! 
\brief Internal function to apply constraints + * + * Tests whether the expression is known to be true or false based + * on existing constraints. If the expression or its negation + * matches a constraint, return the boolean it should be replaced + * with. Otherwise, return false. + */ + Optional TryMatchLiteralConstraint(const PrimExpr& expr) const; + private: // Whether x >= val bool CanProveGreaterEqual(const PrimExpr& x, int64_t val) { diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py index 01cc41c7cec75..4f727cd89b123 100644 --- a/tests/python/unittest/test_tir_transform_simplify.py +++ b/tests/python/unittest/test_tir_transform_simplify.py @@ -136,7 +136,24 @@ def sls(n, d): assert "if" not in str(stmt) -def test_load_store_noop(): +class BaseBeforeAfter: + def test_simplify(self): + before = self.before + before_mod = tvm.IRModule.from_expr(before) + after_mod = tvm.tir.transform.Simplify()(before_mod) + after = after_mod["main"] + expected = self.expected + + try: + tvm.ir.assert_structural_equal(after, expected) + except ValueError as err: + script = tvm.IRModule({"expected": expected, "after": after, "before": before}).script() + raise ValueError( + f"Function after simplification did not match expected:\n{script}" + ) from err + + +class TestLoadStoreNoop(BaseBeforeAfter): """Store of a value that was just read from the same location is a no-op.""" @T.prim_func @@ -147,11 +164,8 @@ def before(A: T.Buffer[(1,), "float32"]): def expected(A: T.Buffer[(1,), "float32"]): T.evaluate(0) - after = tvm.tir.transform.Simplify()(tvm.IRModule.from_expr(before))["main"] - tvm.ir.assert_structural_equal(after, expected) - -def test_load_store_noop_after_simplify(): +class TestLoadStoreNoopAfterSimplify(BaseBeforeAfter): """As test_load_store_noop, but requiring simplification to identify. Previously, a bug caused the self-assignment of a buffer to @@ -168,8 +182,213 @@ def before(A: T.Buffer[(1,), "float32"]): def expected(A: T.Buffer[(1,), "float32"]): T.evaluate(0) - after = tvm.tir.transform.Simplify()(tvm.IRModule.from_expr(before))["main"] - tvm.ir.assert_structural_equal(after, expected) + +class TestNestedCondition(BaseBeforeAfter): + """Nested IfThenElse with the same condition can be simplified. + + Requires const_int_bound to narrow scope of i within the + conditional, or for rewrite_simplify to recognize the literal + constraint. + """ + + @T.prim_func + def before(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + if i == 5: + A[i] = 0.0 + + @T.prim_func + def expected(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + A[i] = 0.0 + + +class TestNestedProvableCondition(BaseBeforeAfter): + """Simplify inner conditional using constraint from outer. + + Requires const_int_bound to narrow scope of i within the + conditional. + """ + + @T.prim_func + def before(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + if i < 7: + A[i] = 0.0 + + @T.prim_func + def expected(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + A[i] = 0.0 + + +class TestNestedVarCondition(BaseBeforeAfter): + """Simplify inner conditional using constraint from outer. + + Requires for rewrite_simplify to recognize the repeated + constraint. 
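+
+    Because ``n`` is a free variable, ConstIntBoundAnalyzer cannot bound ``i``
+    from the ``i == n`` condition; the simplification instead relies on
+    RewriteSimplifier matching the repeated literal constraint.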
+ """ + + @T.prim_func + def before(A: T.Buffer[(16,), "float32"], n: T.int32): + for i in T.serial(16): + if i == n: + if i == n: + A[i] = 0.0 + + @T.prim_func + def expected(A: T.Buffer[(16,), "float32"], n: T.int32): + for i in T.serial(16): + if i == n: + A[i] = 0.0 + + +class TestAlteredBufferContents(BaseBeforeAfter): + """No simplification of data-dependent conditionals. + + A literal constraint must not be propagated if the values + referenced may change. TIR requires single assignment of + variables, so Var objects may be assumed constant, but BufferLoad + may not. + """ + + @T.prim_func + def before(A: T.Buffer[(1,), "int32"], n: T.int32): + if A[0] == n: + A[0] = A[0] + 1 + if A[0] == n: + A[0] = 0 + + expected = before + + +class TestNegationOfCondition(BaseBeforeAfter): + """Use negation of outer condition to simplify innner. + + Within the body of an if statement, the negation of the + condition is known to be false. + """ + + @T.prim_func + def before(A: T.Buffer[(16,), "int32"]): + for i in T.serial(16): + if i == 5: + if i != 5: + A[i] = 0 + else: + A[i] = 1 + + @T.prim_func + def expected(A: T.Buffer[(16,), "int32"]): + for i in T.serial(16): + if i == 5: + A[i] = 1 + + +class TestNegationOfNotEqual(BaseBeforeAfter): + """As TestNegationOfVarCondition, but with a != outer condition. + + Because ConstIntBoundAnalyzer only tracks the min and max allowed + values, the outer i!=5 condition does provide a constraint on the + bounds. This test relies on RewriteSimplifier to recognize + ``i==5`` as the negation of a literal constraint. + """ + + @T.prim_func + def before(A: T.Buffer[(16,), "int32"]): + for i in T.serial(16): + if i != 5: + if i == 5: + A[i] = 0 + else: + A[i] = 1 + + @T.prim_func + def expected(A: T.Buffer[(16,), "int32"]): + for i in T.serial(16): + if i != 5: + A[i] = 1 + + +class TestNegationOfVarCondition(BaseBeforeAfter): + """As TestNegationOfVarCondition, but with a dynamic condition. + + This simplification cannot be done with ConstIntBoundAnalyzer, and + must rely on RewriteSimplifier recognizing the repeated literal. + """ + + @T.prim_func + def before(A: T.Buffer[(16,), "int32"], n: T.int32): + for i in T.serial(16): + if i == n: + if i != n: + A[i] = 0 + else: + A[i] = 1 + + @T.prim_func + def expected(A: T.Buffer[(16,), "int32"], n: T.int32): + for i in T.serial(16): + if i == n: + A[i] = 1 + + +class TestLiteralConstraintSplitBooleanAnd(BaseBeforeAfter): + """Split a boolean AND into independent constraints + + A single if condition may impose multiple literal constraints. + Each constraint that is ANDed together to form the condition + should be treated as an independent constraint. The use of n in + the condition is to ensure we exercise RewriteSimplifier. + """ + + @T.prim_func + def before(A: T.Buffer[(16, 16), "int32"], n: T.int32): + for i, j in T.grid(16, 16): + if i == n and j == n: + if i == n: + A[i, j] = 0 + + @T.prim_func + def expected(A: T.Buffer[(16, 16), "int32"], n: T.int32): + for i, j in T.grid(16, 16): + if i == n and j == n: + A[i, j] = 0 + + +class TestLiteralConstraintSplitBooleanOr(BaseBeforeAfter): + """Split a boolean OR into independent constraints + + Similar to TestLiteralConstraintSplitBooleanAnd, but splitting a + boolean OR into independent conditions. This uses the + simplification that ``!(x || y) == !x && !y``. + + The use of ``n`` in the condition is to ensure we exercise + RewriteSimplifier. 
+ """ + + @T.prim_func + def before(A: T.Buffer[(16, 16), "int32"], n: T.int32): + for i, j in T.grid(16, 16): + if i == n or j == n: + A[i, j] = 0 + else: + if i == n: + A[i, j] = 1 + else: + A[i, j] = 2 + + @T.prim_func + def expected(A: T.Buffer[(16, 16), "int32"], n: T.int32): + for i, j in T.grid(16, 16): + if i == n or j == n: + A[i, j] = 0 + else: + A[i, j] = 2 if __name__ == "__main__": From 12a0f3edcf8295288f4aa9ec3dbb6771c3a1a301 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 2 Jun 2022 14:34:23 -0700 Subject: [PATCH 027/181] [TIR] Add schedule primitive ReIndex (#11515) --- include/tvm/tir/schedule/schedule.h | 13 + python/tvm/tir/schedule/schedule.py | 73 +++ src/tir/schedule/concrete_schedule.cc | 10 + src/tir/schedule/concrete_schedule.h | 2 + src/tir/schedule/primitive.h | 15 + .../schedule/primitive/cache_read_write.cc | 468 ++++++++++++++++++ src/tir/schedule/schedule.cc | 5 + src/tir/schedule/traced_schedule.cc | 12 + src/tir/schedule/traced_schedule.h | 2 + src/tir/schedule/transform.cc | 26 + src/tir/schedule/transform.h | 21 + .../unittest/test_tir_schedule_reindex.py | 203 ++++++++ 12 files changed, 850 insertions(+) create mode 100644 tests/python/unittest/test_tir_schedule_reindex.py diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 48014280a5589..68900e107d7c9 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -364,6 +364,19 @@ class ScheduleNode : public runtime::Object { */ virtual BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope) = 0; + /*! + * \brief Create a block that read/write a buffer region into a read/write cache with reindexing. + * The layout of the cache will be the same as by the iterators of the block that reads/writes the + * buffer. It requires: + * 1) There is only one block who reads/writes the target buffer + * 2) There is only one buffer load/store of this buffer in the block + * \param block_rv The block operates on the target buffer. + * \param buffer_index The index of the buffer in block's read or write region. + * \param buffer_index_type The type of the buffer index, kRead or kWrite. + * \return The reindex stage block. + */ + virtual BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, + BufferIndexType buffer_index_type) = 0; /******** Schedule: Compute location ********/ /*! * \brief Move a producer block under the specific loop, and regenerate the diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index f86228848b9d2..4179088aa534d 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -1056,6 +1056,79 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: self, block, write_buffer_index, storage_scope ) + @type_checked + def reindex(self, block: BlockRV, buffer_index: int, buffer_index_type: str) -> BlockRV: + """Create a block that read/write a buffer region into a read/write cache with reindexing. + The layout of the cache will be the same as by the iterators of the block that reads/writes + the buffer. 
It requires: + 1) There is only one block who reads/writes the target buffer + 2) There is only one buffer load/store of this buffer in the block + + Parameters + ---------- + block: BlockRV + The block that accesses the target buffer + buffer_index: int + The index of the buffer in block's read or write region + buffer_index_type : str + Type of the buffer index, "read" or "write" + + Returns + ------- + reindex_block : BlockRV + The block of the reindex stage + + Examples + -------- + + Before transform_layout, in TensorIR, the IR is: + + .. code-block:: python + + @T.prim_func + def before_reindex( + A: T.Buffer[(128, 128), "float32"], + B: T.Buffer[(128, 128), "float32"] + ) -> None: + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vj, vi] * 2.0 + + Create the schedule and do transform_layout: + + .. code-block:: python + + sch = tir.Schedule(before_reindex) + block = sch.get_block("B") + sch.reindex(block, 0, "read) + + After applying reindex, the IR becomes: + + .. code-block:: python + + @T.prim_func + def after_reindex( + A: T.Buffer[(128, 128), "float32"], + B: T.Buffer[(128, 128), "float32"] + ) -> None: + A_reindex = T.alloc_buffer((128, 128), "float32") + for i, j in T.grid(128, 128): + with T.block("A_reindex"): + vi, vj = T.axis.remap("SS", [i, j]) + A_reindex[vi, vj] = A[vj, vi] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A_reindex[vi, vj] * 2.0 + + """ + assert buffer_index_type in ["read", "write"], "Invalid buffer_index_type" + buffer_index_type_enum = 0 if buffer_index_type == "read" else 1 + return _ffi_api.ScheduleReIndex( # type: ignore # pylint: disable=no-member + self, block, buffer_index, buffer_index_type_enum + ) + ########## Schedule: Compute location ########## @type_checked diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 2289899c329bb..590a0f0025954 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -511,6 +511,16 @@ BlockRV ConcreteScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buff return CreateRV(result); } +BlockRV ConcreteScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index, + BufferIndexType buffer_index_type) { + StmtSRef result{nullptr}; + TVM_TIR_SCHEDULE_BEGIN(); + result = tir::ReIndex(state_, this->GetSRef(block_rv), buffer_index, buffer_index_type); + TVM_TIR_SCHEDULE_END("reindex", this->error_render_level_); + this->state_->DebugVerify(); + return CreateRV(result); +} + /******** Schedule: Compute location ********/ void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 8e83aac2ce823..70c0265611c31 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -109,6 +109,8 @@ class ConcreteScheduleNode : public ScheduleNode { const String& storage_scope) override; BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope) override; + BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, + BufferIndexType buffer_index_type) override; /******** Schedule: Compute location ********/ void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) override; void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, diff --git a/src/tir/schedule/primitive.h 
b/src/tir/schedule/primitive.h index 50dedf71ff528..f4dba69c6b156 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -253,6 +253,21 @@ TVM_DLL StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int r */ TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index, const String& storage_scope); +/*! + *! + * \brief Create a block that read/write a buffer region into a read/write cache with reindexing. + * The layout of the cache will be the same as by the iterators of the block that reads/writes the + * buffer. It requires: + * 1) There is only one block who reads/writes the target buffer + * 2) There is only one buffer load/store of this buffer in the block + * \param self The state of the schedule + * \param block_rv The block operates on the target buffer. + * \param buffer_index The index of the buffer in block's read or write region. + * \param buffer_index_type The type of the buffer index, kRead or kWrite. + * \return The reindex stage block. + */ +TVM_DLL StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index, + BufferIndexType buffer_index_type); /******** Schedule: Compute location ********/ /*! * \brief Move a producer block under the specific loop, and regenerate the diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index 1bba2ae4fc611..c96f88e1f6333 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -160,6 +160,121 @@ Block MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info, return block; } +/*! + * \brief Create the reindex block and generate the corresponding outer loops. + * \details The reindex block is a data copy block between the reindex buffer (the intermediate + * buffer), and the target buffer. + If buffer_index_type == kWrite, copy from the reindex buffer to the target buffer. + If buffer_index_type == kRead, copy from the target buffer to the reindex buffer. + The reindex block has the same block iters and the surrounding loops as the input block. + However, if a block iter is not used in the indices of the target buffer being reindexed, the + domain of the block iter, and the corresponding outer loop, will become constant value one, making + it a trivial iter. + * \param block The block to be reindexed + * \param info The cache info + * \param covered The set of block iter vars covered in the buffer access indices + * \param original_indices The original buffer access indices + * \param buffer_index The index of the target buffer + * \param buffer_index_type The type of buffer index + * \return The reindex block. + */ +Block MakeReIndexStage(const Block& block, CacheStageInfo* info, + const std::unordered_set& covered, + const Array& original_indices, int buffer_index, + BufferIndexType buffer_index_type) { + // iters of the reindex block + Array new_block_iters; + // the substition map from the original block iter to the iters of the reindex block + std::unordered_map block_var_replace_map; + // block access region of reindexed buffer and target buffer + Region reindex_region, target_region; + // indices to access the reindex buffer and the target buffer + Array reindex_indices, target_indices; + + // Step 1: Create block iters, access regions of the reindex block, and accessing indices to the + // reindex buffer. 
+ for (const IterVar& iter : block->iter_vars) { + Var var("v" + std::to_string(new_block_iters.size())); + bool used = covered.count(iter->var); + new_block_iters.push_back(IterVar(/*dom=*/used ? iter->dom : Range::FromMinExtent(0, 1), + /*var=*/var, + /*IterVarType=*/kDataPar)); + if (used) { + reindex_indices.push_back(var); + reindex_region.push_back(Range::FromMinExtent(var, 1)); + } + block_var_replace_map[iter->var] = var; + } + + // Step 2: Replace the original block iters with the new block iters + BufferRegion buffer_region = buffer_index_type == BufferIndexType::kWrite + ? block->writes[buffer_index] + : block->reads[buffer_index]; + target_region = Substitute(buffer_region->region, block_var_replace_map); + for (const PrimExpr& index : original_indices) { + target_indices.push_back(Substitute(index, block_var_replace_map)); + } + + // Step 3: Create the reindex block + + // The src and the dst region and indices of the data copy + Region src_region{nullptr}; + Region dst_region{nullptr}; + Array src_indices{nullptr}; + Array dst_indices{nullptr}; + + if (buffer_index_type == BufferIndexType::kWrite) { + src_region = reindex_region; + dst_region = target_region; + src_indices = reindex_indices; + dst_indices = target_indices; + } else { + src_region = target_region; + dst_region = reindex_region; + src_indices = target_indices; + dst_indices = reindex_indices; + } + + // Create the body block + Block new_block( + /*iter_vars=*/new_block_iters, + /*reads=*/ + {BufferRegion(info->read_buffer, src_region)}, + /*writes=*/ + {BufferRegion(info->write_buffer, dst_region)}, + /*name_hint=*/buffer_region->buffer->name + "_reindex", + /*body=*/ + BufferStore(info->write_buffer, BufferLoad(info->read_buffer, src_indices), dst_indices)); + + // Step 4: Create surrounding loops + + // Create loop vars and bindings for block iters + std::vector loop_vars; // loop variables + std::vector iter_values; // bindings in block realize + for (int i = 0; i < static_cast(block->iter_vars.size()); ++i) { + Var loop_var("ax" + std::to_string(loop_vars.size())); + loop_vars.push_back(loop_var); + iter_values.push_back(loop_var); + } + + // Create the block realize node + Stmt body = BlockRealize(/*values=*/iter_values, + /*predicate=*/const_true(), + /*block=*/new_block); + + // Create the chain of loops + for (int i = static_cast(new_block_iters.size()) - 1; i >= 0; --i) { + body = For(/*loop_var=*/loop_vars[i], + /*min=*/new_block_iters[i]->dom->min, + /*extent=*/new_block_iters[i]->dom->extent, + /*kind=*/ForKind::kSerial, + /*body=*/std::move(body)); + } + // Update cache info, which will be used in the later rewriting. + info->cache_stage = std::move(body); + return new_block; +} + /*! * \brief Recalculate the `affine_binding` flag of a specifc block * \param block_sref The sref to the specific block @@ -599,6 +714,252 @@ class CacheWriteRewriter : public StmtExprMutator { bool under_writer_block_{false}; }; +/*! + * \brief Create a new buffer by change the shape with block iters to be used as the reindex buffer + * \param buffer The given buffer. + * \param block_iters The block iters. + * \param covered Set of block iter vars covered by the buffer access indices + * \return The new buffer with target shape. 
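+ *
+ * For example, with covered block iters i and j of extent 128 and an uncovered reduction
+ * iter k, the resulting buffer shape is [128, 128]; uncovered iters contribute no
+ * dimension. (The extent of 128 is illustrative only.)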
+ */ +Buffer CreateReindexBuffer(const Buffer& buffer, const Array& block_iters, + const std::unordered_set& covered) { + ObjectPtr new_buffer = make_object(*buffer.get()); + ObjectPtr new_var = make_object(*buffer->data.get()); + std::vector new_shape; + std::vector new_strides; + for (const auto& iter : block_iters) { + if (covered.count(iter->var)) { + new_shape.push_back(iter->dom->min + iter->dom->extent); + } + } + new_strides.clear(); + new_buffer->shape = new_shape; + new_buffer->strides = new_strides; + new_buffer->data = buffer->data.copy_with_suffix("_reindex"); + new_buffer->name = buffer->name + "_reindex"; + return Buffer(new_buffer); +} + +/*! \brief The schedule error that the target is not a leaf block. */ +class NotLeafBlockError : public ScheduleError { + public: + NotLeafBlockError(IRModule mod, Block block) : mod_(std::move(mod)), block_(std::move(block)) {} + String FastErrorString() const final { + return "ScheduleError: The target block is not a leaf block."; + } + + String DetailRenderTemplate() const final { return "The target block {0} is not a leaf block."; } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + IRModule mod_; + Block block_; +}; + +/*! \brief The schedule error that the buffer access is invalid for reindex. */ +class InvalidBufferAccessError : public ScheduleError { + public: + enum class ErrorKind { + kNoAccess, // buffer access not found + kNonUniqueAccess, // multiple buffer accesses with different indices + kOpaqueAccess, // opaque access to the buffer + }; + + InvalidBufferAccessError(IRModule mod, Buffer buffer, Block block, ErrorKind kind) + : mod_(std::move(mod)), buffer_(std::move(buffer)), block_(std::move(block)), kind_(kind) {} + String FastErrorString() const final { + return "ScheduleError: The target buffer should be accessed via BufferLoad or BufferStore. The " + "indices should be the same if there are multiple accesses to the target buffer."; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "The target buffer " << buffer_->name + << " should be accessed in the leaf block {0} via BufferLoad or BufferStore. The indices " + "should be the same if there are multiple accesses to the target buffer. "; + if (kind_ == ErrorKind::kNoAccess) { + os << "No buffer accesses found."; + } else if (kind_ == ErrorKind::kNonUniqueAccess) { + os << "Multiple buffer accesses have non-unique indices."; + } else if (kind_ == ErrorKind::kOpaqueAccess) { + os << "Opaque buffer accesses found."; + } + return os.str(); + } + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + + private: + IRModule mod_; + Buffer buffer_; + Block block_; + ErrorKind kind_; +}; + +/*! 
\brief Collect the related Load/Store to reindex */ +class ReIndexCollector : public StmtExprVisitor { + public: + static Array Collect(const IRModule& mod, const Buffer& buffer, const Block& block) { + ReIndexCollector collector(mod, buffer, block); + collector(block->body); + if (!collector.buffer_access_indices_.defined()) { + throw InvalidBufferAccessError(mod, buffer, block, + InvalidBufferAccessError::ErrorKind::kNoAccess); + } + return collector.buffer_access_indices_.value(); + } + + private: + explicit ReIndexCollector(const IRModule& mod, const Buffer& buffer, const Block& block) + : mod_(mod), buffer_(buffer), block_(block) {} + + void VisitExpr_(const BufferLoadNode* load) final { + StmtExprVisitor::VisitExpr_(load); + if (load->buffer.same_as(buffer_)) { + CheckAndUpdateBufferAccessIndices(load->indices); + } + } + + void VisitStmt_(const BlockNode* block) final { + // no sub-blocks under this block + throw NotLeafBlockError(mod_, block_); + } + + void VisitStmt_(const BufferStoreNode* store) final { + StmtExprVisitor::VisitStmt_(store); + if (store->buffer.same_as(buffer_)) { + CheckAndUpdateBufferAccessIndices(store->indices); + } + } + + void CheckAndUpdateBufferAccessIndices(const Array indices) { + if (!buffer_access_indices_.defined()) { + buffer_access_indices_ = indices; + return; + } else if (!std::equal(buffer_access_indices_.value().begin(), + buffer_access_indices_.value().end(), indices.begin(), indices.end(), + ExprDeepEqual())) { + throw InvalidBufferAccessError(mod_, buffer_, block_, + InvalidBufferAccessError::ErrorKind::kNonUniqueAccess); + } + } + + void VisitExpr_(const VarNode* var) final { + if (var == buffer_->data.get()) { + throw InvalidBufferAccessError(mod_, buffer_, block_, + InvalidBufferAccessError::ErrorKind::kOpaqueAccess); + } + } + /*! \brief The IR module */ + IRModule mod_; + /*! \brief The buffer to rewrite */ + Buffer buffer_; + /*! \brief The block to visit */ + Block block_; + /*! \brief The indices of buffer acess to rewrite */ + Optional> buffer_access_indices_; +}; + +/*! \brief Mutator of ReIndex */ +class ReIndexRewriter : public StmtExprMutator { + public: + static Stmt Rewrite(const StmtSRef& scope_sref, const StmtSRef& block_sref, CacheStageInfo* info, + const std::unordered_set& covered) { + ReIndexRewriter rewriter(block_sref, info, covered); + return rewriter(GetRef(scope_sref->stmt)); + } + + private: + explicit ReIndexRewriter(const StmtSRef& block_sref, CacheStageInfo* info, + const std::unordered_set& covered) + : block_sref_(block_sref), info_(info), covered_(covered) { + new_buffer_ = info->alloc; + old_buffer_ = info->read_buffer.same_as(new_buffer_) ? 
info->write_buffer : info->read_buffer; + } + + Stmt VisitStmt_(const BlockNode* block) final { + Block old_stmt = GetRef(block); + if (is_scope_) { + is_scope_ = false; + Block stmt = Downcast(StmtExprMutator::VisitStmt_(block)); + // Insert cache stage into the loop + ObjectPtr n = make_object(*stmt.as()); + n->body = InsertCacheStage(n->body, info_->loc_pos, info_->cache_stage); + n->alloc_buffers.push_back(info_->alloc); + stmt = Block(n); + info_->block_reuse.Set(old_stmt, stmt); + return stmt; + } + + // Visiting the blokc being reindexed + if (block == block_sref_->stmt) { + // Collect the updated indices and regions + for (const IterVar& iter : block->iter_vars) { + if (covered_.count(iter->var)) { + indices_.push_back(iter->var); + region_.push_back(Range::FromMinExtent(iter->var, 1)); + } + } + Block stmt = Downcast(StmtExprMutator::VisitStmt_(block)); + // Update block reads/writes to use the intermediate reindex buffer + auto writes = + ReplaceBufferRegion(block->writes, old_buffer_, BufferRegion{new_buffer_, region_}); + auto reads = + ReplaceBufferRegion(block->reads, old_buffer_, BufferRegion{new_buffer_, region_}); + auto match_buffers = ReplaceBufferRegion(block->match_buffers, old_buffer_, + BufferRegion{new_buffer_, region_}); + if (!writes.same_as(block->writes) || !reads.same_as(block->reads) || + !match_buffers.same_as(block->match_buffers)) { + ObjectPtr n = make_object(*stmt.as()); + n->writes = std::move(writes); + n->reads = std::move(reads); + n->match_buffers = std::move(match_buffers); + stmt = Block(n); + } + info_->block_reuse.Set(old_stmt, stmt); + return stmt; + } + return old_stmt; + } + + template + Node VisitBufferAccess(Node node) { + if (node->buffer.same_as(old_buffer_)) { + auto* n = node.CopyOnWrite(); + n->buffer = new_buffer_; + n->indices = indices_; + } + return node; + } + Stmt VisitStmt_(const BufferStoreNode* op) final { + BufferStore buffer_store = Downcast(StmtExprMutator::VisitStmt_(op)); + return VisitBufferAccess(std::move(buffer_store)); + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + BufferLoad buffer_load = Downcast(StmtExprMutator::VisitExpr_(op)); + return VisitBufferAccess(std::move(buffer_load)); + } + + private: + /*! \brief The parent scope of the insertion. */ + const StmtSRef& block_sref_; + /*! \brief The info for inserting reindex stage. */ + CacheStageInfo* info_; + /*! \brief Whether old block var is covered in the indices */ + const std::unordered_set& covered_; + /*! \brief Whether the current block is scope block */ + bool is_scope_{true}; + /*! \brief The buffer to be replaced */ + Buffer old_buffer_; + /*! \brief The reindex buffer */ + Buffer new_buffer_; + /*! \brief The new indices */ + Array indices_; + /*! \brief The new region */ + Region region_; +}; + /******** Implementation ********/ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buffer_index, @@ -729,6 +1090,80 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu return result_block_sref; } +StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index, + BufferIndexType buffer_index_type) { + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + Block block = GetRef(block_ptr); + Buffer buffer = + GetNthAccessBuffer(self, block, buffer_index, buffer_index_type == BufferIndexType::kWrite); + StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + arith::Analyzer analyzer; + + // Step 1. 
Collect the original indices and check there's only single pattern of related + // Load/Store and the buffer is not accessed opaquely + Array original_indices = ReIndexCollector::Collect(self->mod, buffer, block); + // Simplify the indices if possible + for (const IterVar& iter : block->iter_vars) { + analyzer.Bind(iter->var, iter->dom); + } + original_indices.MutateByApply( + [&analyzer](const PrimExpr& expr) { return analyzer.Simplify(expr); }); + + // Collect block iters appearing in the original_indices + std::unordered_set covered; + for (const PrimExpr& index : original_indices) { + PreOrderVisit(index, [&](const ObjectRef& obj) -> bool { + if (const VarNode* var = obj.as()) { + covered.insert(GetRef(var)); + } + return true; + }); + } + + // Step 2. Creating CacheStageInfo + CacheStageInfo info; + // Create the corresponding buffer to be read(write), i.e. the result of reindex read(write) + if (buffer_index_type == BufferIndexType::kWrite) { + info.read_buffer = CreateReindexBuffer(buffer, block->iter_vars, covered); + info.write_buffer = buffer; + info.alloc = info.read_buffer; + } else { + info.read_buffer = buffer; + info.write_buffer = CreateReindexBuffer(buffer, block->iter_vars, covered); + info.alloc = info.write_buffer; + } + + // Step 3. Check the block belongs to a chain loop nesting under the scope, + // and get the insert location + const StmtSRefNode* loop; + for (loop = block_sref->parent; loop->parent != scope_sref.get();) { + const ForNode* outer = loop->parent->StmtAs(); + const ForNode* inner = loop->StmtAs(); + ICHECK(outer != nullptr && inner != nullptr); + ICHECK(outer->body.get() == inner); + loop = loop->parent; + } + + info.loc_pos = loop->seq_index == -1 ? 0 : loop->seq_index; + if (buffer_index_type == BufferIndexType::kWrite) { + info.loc_pos++; + } + + // Step 4. Making new reindex stage block and rewrite + Block reindex_stage = + MakeReIndexStage(block, &info, covered, original_indices, buffer_index, buffer_index_type); + Stmt new_scope = ReIndexRewriter::Rewrite(scope_sref, block_sref, &info, covered); + + // Step 5. 
Replacing and updating flags + self->Replace(scope_sref, new_scope, info.block_reuse); + StmtSRef result_block_sref = self->stmt2ref.at(reindex_stage.get()); + BlockInfo& block_info = self->block_info[result_block_sref]; + block_info.affine_binding = CalculateAffineFlag(self, result_block_sref); + block_info.region_cover = true; + block_info.scope->stage_pipeline = true; + return result_block_sref; +} + /******** Instruction Registration ********/ struct CacheReadTraits : public UnpackedInstTraits { @@ -787,7 +1222,40 @@ struct CacheWriteTraits : public UnpackedInstTraits { friend struct ::tvm::tir::UnpackedInstTraits; }; +struct ReIndexTraits : public UnpackedInstTraits { + static constexpr const char* kName = "ReIndex"; + static constexpr bool kIsPure = false; + + private: + static constexpr size_t kNumInputs = 1; + static constexpr size_t kNumAttrs = 2; + static constexpr size_t kNumDecisions = 0; + + static BlockRV UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer buffer_index, + Integer buffer_index_type) { + return sch->ReIndex(block, buffer_index, + static_cast(buffer_index_type->value)); + } + + static String UnpackedAsPython(Array outputs, String block, Integer buffer_index, + Integer buffer_index_type) { + PythonAPICall py("reindex"); + py.Input("block", block); + py.Input("buffer_index", buffer_index); + py.Input("buffer_index_type", '"' + + std::string(BufferIndexType2Str( + static_cast(buffer_index_type->value))) + + '"'); + py.SingleOutput(outputs); + return py.Str(); + } + + template + friend struct ::tvm::tir::UnpackedInstTraits; +}; + TVM_REGISTER_INST_KIND_TRAITS(CacheReadTraits); TVM_REGISTER_INST_KIND_TRAITS(CacheWriteTraits); +TVM_REGISTER_INST_KIND_TRAITS(ReIndexTraits); } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index fb884ce77f7b7..3880d0b19eeb8 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -165,6 +165,11 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheRead") .set_body_method(&ScheduleNode::CacheRead); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheWrite") .set_body_method(&ScheduleNode::CacheWrite); +TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReIndex") + .set_body_typed([](Schedule self, const BlockRV& block_rv, int buffer_index, + int buffer_index_type) { + return self->ReIndex(block_rv, buffer_index, static_cast(buffer_index_type)); + }); /******** (FFI) Compute location ********/ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleComputeAt") .set_body_method(&ScheduleNode::ComputeAt); diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 8156480a4516b..d2f627edfd11d 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -265,6 +265,18 @@ BlockRV TracedScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buffer return result; } +BlockRV TracedScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index, + BufferIndexType buffer_index_type) { + BlockRV result = ConcreteScheduleNode::ReIndex(block_rv, buffer_index, buffer_index_type); + + static const InstructionKind& kind = InstructionKind::Get("ReIndex"); + trace_->Append(/*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{block_rv}, + /*attrs=*/{Integer(buffer_index), Integer(buffer_index_type)}, + /*outputs=*/{result})); + return result; +} + /******** Schedule: Compute location ********/ void TracedScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, diff --git a/src/tir/schedule/traced_schedule.h 
b/src/tir/schedule/traced_schedule.h index d1860be9512d7..ba4a4b99cbb2d 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -73,6 +73,8 @@ class TracedScheduleNode : public ConcreteScheduleNode { const String& storage_scope) final; BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope) final; + BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, + BufferIndexType buffer_index_type) final; /******** Schedule: Compute location ********/ void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) final; void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index 79802ecd65dbb..67d0f55f20b9f 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -70,6 +70,32 @@ Array ReplaceBuffer(Array match_buffers, c return match_buffers; } +Array ReplaceBufferRegion(Array regions, const Buffer& source_buffer, + const BufferRegion& target) { + regions.MutateByApply([&source_buffer, &target](const BufferRegion& region) -> BufferRegion { + if (region->buffer.same_as(source_buffer)) { + return target; + } + return region; + }); + return regions; +} + +Array ReplaceBufferRegion(Array match_buffers, + const Buffer& source_buffer, + const BufferRegion& target) { + match_buffers.MutateByApply([&source_buffer, &target]( + const MatchBufferRegion& match_buffer) -> MatchBufferRegion { + if (match_buffer->source->buffer.same_as(source_buffer)) { + ObjectPtr n = make_object(*match_buffer.get()); + n->source = target; + return MatchBufferRegion(n); + } + return match_buffer; + }); + return match_buffers; +} + /******** ReplaceBufferMutator ********/ ReplaceBufferMutator::ReplaceBufferMutator(const Buffer& old_buffer, Buffer new_buffer, Map* block_sref_reuse) diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h index 192d44d9e9adc..908a823c2d860 100644 --- a/src/tir/schedule/transform.h +++ b/src/tir/schedule/transform.h @@ -73,6 +73,27 @@ Array ReplaceBuffer(Array regions, const Buffer& sou Array ReplaceBuffer(Array match_buffers, const Buffer& source, const Buffer& target); +/*! + * \brief Replaces the buffer region within the specific sequence of regions + * \param regions The regions to be replaced + * \param source_buffer The buffer to whose region is to be replaced + * \param target The buffer region to be replaced to + * \return The new sequence of regions after replacement + */ +Array ReplaceBufferRegion(Array regions, const Buffer& source_buffer, + const BufferRegion& target); + +/*! + * \brief Replaces the buffer region within the specific sequence of match_buffers + * \param regions The match_buffers to be replaced + * \param source_buffer The buffer to whose region is to be replaced + * \param target The buffer region to be replaced to + * \return The new sequence of match_buffers after replacement + */ +Array ReplaceBufferRegion(Array match_buffers, + const Buffer& source_buffer, + const BufferRegion& target); + /*! * \brief A helper mutator which recursively replaces the old buffer with the new buffer and * collects the block sref reuse information for the following replacement. 
diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py new file mode 100644 index 0000000000000..9b2e37a19813a --- /dev/null +++ b/tests/python/unittest/test_tir_schedule_reindex.py @@ -0,0 +1,203 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-function-docstring,missing-module-docstring +import pytest +import tvm +import tvm.testing +from tvm import tir +from tvm.script import tir as T +from tvm.tir.schedule.schedule import ScheduleError +from tvm.tir.schedule.testing import verify_trace_roundtrip + + +@T.prim_func +def transpose_elementwise( + A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"] +) -> None: + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vj, vi] * 2.0 + + +@T.prim_func +def transpose_elementwise_reindex_read( + A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"] +) -> None: + A_reindex = T.alloc_buffer((128, 128), "float32") + for i, j in T.grid(128, 128): + with T.block("A_reindex"): + vi, vj = T.axis.remap("SS", [i, j]) + A_reindex[vi, vj] = A[vj, vi] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A_reindex[vi, vj] * 2.0 + + +@T.prim_func +def conv2d_nhwc( + Input: T.Buffer[(1, 224, 224, 3), "float32"], + Weight: T.Buffer[(7, 7, 3, 64), "float32"], + Conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"], +) -> None: + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + for i0, i1, i2, i3 in T.grid(1, 230, 230, 3): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else( + ((((i1_1 >= 3) and (i1_1 < 227)) and (i2_1 >= 3)) and (i2_1 < 227)), + Input[i0_1, (i1_1 - 3), (i2_1 - 3), i3_1], + T.float32(0), + dtype="float32", + ) + for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 112, 112, 64, 7, 7, 3): + with T.block("conv2d_nhwc"): + n, h, w, co, rh, rw, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) + with T.init(): + Conv2d_nhwc[n, h, w, co] = T.float32(0) + Conv2d_nhwc[n, h, w, co] = Conv2d_nhwc[n, h, w, co] + ( + PadInput[n, ((h * 2) + rh), ((w * 2) + rw), ((T.floordiv(co, 64) * 3) + rc)] + * Weight[rh, rw, rc, co] + ) + + +@T.prim_func +def conv2d_nhwc_reindex_weight( + var_inputs: T.handle, var_weight: T.handle, var_conv2d_nhwc: T.handle +) -> None: + inputs = T.match_buffer(var_inputs, [1, 224, 224, 3], dtype="float32") + weight = T.match_buffer(var_weight, [7, 7, 3, 64], dtype="float32") + conv2d_nhwc = T.match_buffer(var_conv2d_nhwc, [1, 112, 112, 64], dtype="float32") + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + weight_reindex = T.alloc_buffer([64, 7, 7, 3], 
dtype="float32") + for i0, i1, i2, i3 in T.grid(1, 230, 230, 3): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inputs[i0_1, i1_1 - 3, i2_1 - 3, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else( + i1_1 >= 3 and i1_1 < 227 and i2_1 >= 3 and i2_1 < 227, + inputs[i0_1, i1_1 - 3, i2_1 - 3, i3_1], + T.float32(0), + dtype="float32", + ) + for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(1, 1, 1, 64, 7, 7, 3): + with T.block("weight_reindex"): + v0, v1, v2, v3, v4, v5, v6 = T.axis.remap( + "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6] + ) + T.reads(weight[v4, v5, v6, v3]) + T.writes(weight_reindex[v3, v4, v5, v6]) + weight_reindex[v3, v4, v5, v6] = weight[v4, v5, v6, v3] + for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 112, 112, 64, 7, 7, 3): + with T.block("conv2d_nhwc"): + n, h, w, co, rh, rw, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) + T.reads( + PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], + weight_reindex[co, rh, rw, rc], + ) + T.writes(conv2d_nhwc[n, h, w, co]) + with T.init(): + conv2d_nhwc[n, h, w, co] = T.float32(0) + conv2d_nhwc[n, h, w, co] = ( + conv2d_nhwc[n, h, w, co] + + PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] + * weight_reindex[co, rh, rw, rc] + ) + + +@T.prim_func +def matmul( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], +) -> None: + for i0, i1, i2 in T.grid(512, 512, 512): + with T.block("matmul"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(C[i, j], A[i, k], B[k, j]) + T.writes(C[i, j]) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + + +@T.prim_func +def matmul_reindex_write( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], +) -> None: + C_reindex = T.alloc_buffer([512, 512], dtype="float32") + for i0, i1, i2 in T.grid(512, 512, 512): + with T.block("matmul"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(C_reindex[i, j], A[i, k], B[k, j]) + T.writes(C_reindex[i, j]) + with T.init(): + C_reindex[i, j] = T.float32(0) + C_reindex[i, j] = C_reindex[i, j] + A[i, k] * B[k, j] + for i0, i1, i2 in T.grid(512, 512, 1): + with T.block("C_reindex"): + v0, v1, v2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(C_reindex[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_reindex[v0, v1] + + +@T.prim_func +def multiple_read(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"]) -> None: + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vj, vi] + A[vi, vj] + + +def test_reindex_read_basic(): + sch = tir.Schedule(transpose_elementwise) + block = sch.get_block("B") + sch.reindex(block, 0, "read") + tvm.ir.assert_structural_equal(transpose_elementwise_reindex_read, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=transpose_elementwise) + + +def test_conv2d_reindex_read(): + sch = tir.Schedule(conv2d_nhwc) + block = sch.get_block("conv2d_nhwc") + sch.reindex(block, 1, "read") + tvm.ir.assert_structural_equal(conv2d_nhwc_reindex_weight, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc) + + +def test_matmul_reindex_write(): + sch = tir.Schedule(matmul) + block = sch.get_block("matmul") + sch.reindex(block, 0, "write") + tvm.ir.assert_structural_equal(matmul_reindex_write, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=matmul) + + +def 
test_reindex_fail_multiple_read(): + sch = tir.Schedule(multiple_read) + block = sch.get_block("B") + with pytest.raises(ScheduleError): + sch.reindex(block, 0, "read") + + +if __name__ == "__main__": + tvm.testing.main() From aff1312e365142bcb77d6ae847753702a4e3a0c6 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 2 Jun 2022 14:37:11 -0700 Subject: [PATCH 028/181] [PROFILER] Fix percent compute bound calculation (#11542) * [PROFILER] Fix percent compute bound calculation Somehow the runtime was dropped from the percent compute bound calculation. Tolerances on the test we bumped a little bit higher to try and catch mistakes like this in the future. * forgot print --- python/tvm/utils/roofline.py | 2 +- tests/python/unittest/test_runtime_profiling.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py index 6d1ac753e27e5..8a17b9f003123 100644 --- a/python/tvm/utils/roofline.py +++ b/python/tvm/utils/roofline.py @@ -392,7 +392,7 @@ def roofline_from_existing( compute_bound = arith_inten > ridge_point call["Bound"] = "compute" if compute_bound else "memory" per_mem_bound = (loaded_bytes / runtime) / peak_bandwidth * 100 - per_compute_bound = flops / peak_flops * 100.0 + per_compute_bound = (flops / runtime) / peak_flops * 100.0 # We use ratio here because the percentages should be averaged instead of summed. call["Percent of Theoretical Optimal"] = profiling.Ratio( per_compute_bound if compute_bound else per_mem_bound diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 919057f08d27c..29a8414337756 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -328,7 +328,7 @@ def test_roofline_analysis(target, dev): # Ideally we'd like a little tighter bound here, but it is hard to # know how well this dense will perform without tuning. And we # don't have an operator that uses a specific number of flops. - assert call["Percent of Theoretical Optimal"].ratio >= 0 + assert call["Percent of Theoretical Optimal"].ratio >= 5.0 @tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386") @@ -354,7 +354,7 @@ def test_roofline_analysis_rpc(): # Ideally we'd like a little tighter bound here, but it is hard to # know how well this dense will perform without tuning. And we # don't have an operator that uses a specific number of flops. - assert call["Percent of Theoretical Optimal"].ratio >= 0 + assert call["Percent of Theoretical Optimal"].ratio >= 5.0 if __name__ == "__main__": From 017d410bd18fd3e272ea49ea9e11955c3128bb72 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 2 Jun 2022 14:37:40 -0700 Subject: [PATCH 029/181] Fix docker/lint.sh after #10933. 
(#11541) --- docker/lint.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/lint.sh b/docker/lint.sh index a968bc1e6421b..4f7bca445a9fd 100755 --- a/docker/lint.sh +++ b/docker/lint.sh @@ -20,7 +20,7 @@ source "$(dirname $0)/dev_common.sh" SCRIPT_NAME="$0" -DEFAULT_STEPS=( file_type asf cpplint clang_format pylint python_format jnilint cppdocs mypy ) +DEFAULT_STEPS=( file_type asf clang_format cpplint python_format pylint jnilint cppdocs mypy ) inplace_fix=0 @@ -43,12 +43,12 @@ function run_lint_step() { ;; clang_format) if [ $inplace_fix -eq 0 ]; then - cmd=( tests/lint/clang_format.sh ) + cmd=( tests/lint/git-clang-format.sh ) else # NOTE: need to run git status to update some docker-side cache. Otherwise, # git-clang-format will fail with "The following files would be modified but have # unstaged changes:" - cmd=( bash -c 'git status &>/dev/null && tests/lint/git-clang-format.sh -i origin/main' ) + cmd=( bash -c 'git status &>/dev/null && tests/lint/git-clang-format.sh -i --rev origin/main' ) fi ;; cpplint) @@ -62,9 +62,9 @@ function run_lint_step() { ;; python_format) if [ $inplace_fix -eq 0 ]; then - cmd=( tests/lint/python_format.sh ) + cmd=( tests/lint/git-black.sh ) else - cmd=( tests/lint/git-black.sh -i origin/main ) + cmd=( tests/lint/git-black.sh -i --rev origin/main ) fi ;; jnilint) From f31477f9c3c5ad618750ad6d43b6d6020f6b44d6 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 2 Jun 2022 16:47:20 -0700 Subject: [PATCH 030/181] [FIX] Pad feature vectors to the same size in xgboost cost model (#11479) * [FIX] Pad feature vectors to the same size in xgboost cost model * add test * more test * explaination * formatting --- .../tvm/autotvm/tuner/xgboost_cost_model.py | 24 +++++++++++++------ python/tvm/testing/autotvm.py | 11 ++++++--- .../unittest/test_autotvm_xgboost_model.py | 4 ++++ 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 637891854aee0..d4942ce6a4ca0 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -243,18 +243,27 @@ def fit_log(self, records, plan_size, min_seed_records=500): else: raise RuntimeError("Invalid feature type: " + self.fea_type) result = pool.map_with_error_catching(feature_extract_func, data) + result = list(result) # store results so we can iterate through them twice - # filter out feature with different shapes - fea_len = len(self._get_feature([0])[0]) + # get maximum feature length + fea_len = -1 + for res in result: + if res.status != StatusKind.COMPLETE: + continue + x, _ = res.value + fea_len = max(fea_len, x.shape[0]) xs, ys = [], [] for res in result: if res.status != StatusKind.COMPLETE: continue x, y = res.value - if len(x) == fea_len: + # Features may not be the same size, pad them until they are + if fea_len > len(x): + xs.append(np.pad(x, (0, fea_len - len(x)))) + else: xs.append(x) - ys.append(y) + ys.append(y) if len(xs) < min_seed_records: # no enough samples return False @@ -329,15 +338,16 @@ def _get_feature(self, indexes): for i, fea in zip(need_extract, feas): fea_cache[i] = fea.value if fea.status == StatusKind.COMPLETE else None - feature_len = None + feature_len = -1 for idx in indexes: if fea_cache[idx] is not None: - feature_len = fea_cache[idx].shape[-1] - break + feature_len = max(fea_cache[idx].shape[-1], feature_len) ret = np.empty((len(indexes), feature_len), dtype=np.float32) for i, ii in 
enumerate(indexes): t = fea_cache[ii] + if t.shape[0] < feature_len: + t = np.pad(t, (0, feature_len - t.shape[0])) ret[i, :] = t if t is not None else 0 return ret diff --git a/python/tvm/testing/autotvm.py b/python/tvm/testing/autotvm.py index 6f7bb13fe6dca..b1132cd1faa7f 100644 --- a/python/tvm/testing/autotvm.py +++ b/python/tvm/testing/autotvm.py @@ -62,9 +62,14 @@ def matmul(N, L, M, dtype): # schedule according to config yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) + # Make sure configurations have a varied number of itervars. Splitting adds + # new itervars, so conditionally splitting with cause the number of + # itervars to depend on the tile size. + if cfg["tile_x"].size[-1] > 1: + xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yo, xo, k, yi, xi) + else: + s[C].reorder(yo, k, yi, x) return s, [A, B, C] diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py index baecdaceab6d3..7fa3daede07e1 100644 --- a/tests/python/unittest/test_autotvm_xgboost_model.py +++ b/tests/python/unittest/test_autotvm_xgboost_model.py @@ -43,6 +43,10 @@ def test_fit(): upper_model.fit(xs, ys, plan_size=32) + # feature lengths are not guaranteed to always be the same + upper_model.predict(np.ones(12)) + upper_model.predict(np.ones(8)) + def fit_spawn(): assert multiprocessing.get_start_method(False) == "spawn" From 274d8fa964489e03ad97e684902063d935bf192b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 2 Jun 2022 16:56:50 -0700 Subject: [PATCH 031/181] Unbreak CI image build (tensorflow 2.6.5, ci_gpu bugfix) (#11546) * Pin protobuf to 3.20.1 due to #11545. * Unpin and instead update to 2.6.5 * attempt to fix gpu build * Revert to 2.6.3, pin protobuf for ci-arm. 
* escape bash char --- docker/Dockerfile.ci_gpu | 2 +- docker/install/ubuntu_install_tensorflow.sh | 2 +- docker/install/ubuntu_install_tensorflow_aarch64.sh | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 73d13007f1d06..e0d1997de729b 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -24,7 +24,7 @@ FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub # Base scripts -RUN rm /etc/apt/sources.list.d/nvidia-ml.list && apt-get clean +RUN rm -f /etc/apt/sources.list.d/nvidia-ml.list && apt-get clean RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index eaf89ffcf8fef..17d2b31d9bc24 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -23,4 +23,4 @@ set -o pipefail pip3 install \ "h5py==3.1.0" \ keras==2.6 \ - tensorflow==2.6.2 + tensorflow==2.6.5 diff --git a/docker/install/ubuntu_install_tensorflow_aarch64.sh b/docker/install/ubuntu_install_tensorflow_aarch64.sh index 6acf8b7270d81..8d5b6765deb05 100755 --- a/docker/install/ubuntu_install_tensorflow_aarch64.sh +++ b/docker/install/ubuntu_install_tensorflow_aarch64.sh @@ -26,5 +26,6 @@ apt-get install -y --no-install-recommends libhdf5-dev pip3 install \ "h5py==3.1.0" \ keras==2.6 \ - tensorflow-aarch64==2.6.2 \ + tensorflow-aarch64==2.6.3 \ + "protobuf<4" \ -f https://snapshots.linaro.org/ldcg/python-cache/tensorflow-aarch64/ From 2ae20882d3e34cc6e5acef992c23c17a585c25aa Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Fri, 3 Jun 2022 11:58:30 -0400 Subject: [PATCH 032/181] [hexagon][testing] add TIRScript elemwise-add (#11490) Replace TE-based elementwise-add benchmark with a TVMScript-based one. Update Hexagon target architecture from v68 to v69. As a result, the benchmark now requires a version of Hexagon SDK newer than 4.4.0.1. Version 4.5.0.3 is known to work. --- .../test_hexagon/benchmark_elemwise_add.py | 434 ++++++++++++++++++ .../contrib/test_hexagon/benchmark_hexagon.py | 245 ---------- .../contrib/test_hexagon/benchmark_util.py | 34 ++ 3 files changed, 468 insertions(+), 245 deletions(-) create mode 100644 tests/python/contrib/test_hexagon/benchmark_elemwise_add.py delete mode 100644 tests/python/contrib/test_hexagon/benchmark_hexagon.py diff --git a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py new file mode 100644 index 0000000000000..70266d7939bc5 --- /dev/null +++ b/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py @@ -0,0 +1,434 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import os.path +import sys +import pytest +import numpy as np +import logging +import tempfile + +import tvm.testing +import tvm.script +from tvm.script import tir as T +from tvm import te +from tvm.contrib.hexagon.build import HexagonLauncherRPC +from . import benchmark_util + +# This is a fixed detail of the v68 architecture. +HVX_VECTOR_BYTES = 128 + +_HEXAGON_TARGET = tvm.target.hexagon("v69", link_params=True) + +_SUPER_TARGET = tvm.target.Target(_HEXAGON_TARGET, host=_HEXAGON_TARGET) + +# NOTE on server ports: +# These tests use different port numbers for the RPC server (7070 + ...). +# The reason is that an RPC session cannot be gracefully closed without +# triggering TIME_WAIT state on the server socket. This prevents another +# server to bind to the same port until the wait time elapses. + +_BT = benchmark_util.BenchmarksTable() + +_CSV_COLUMN_ORDER = [ + # Identifies which TE-compute / TIRScript is used as the basis for the + # benchmarked primfunc. Only needs to be meaningful to humans. + "basic_kernel", + # The tensors' element type + "dtype", + # When applicable, indicates the particular variation of schedules + # apply by the Python code. Decoding this may require looking at this + # script's source code. + "sched_type", + # The memory location of the tensors used during the execution of + # the primfunc. We currently assume just one location. + # This will likely need to be generalized as we add more sophisticated + # primfuncs. + "mem_scope", + # For primfuncs that treat tensor buffers as collections of 1D vectors, + # this is the number of vectors in each tensor. + # This will likely need to be generalized as we add more sophisticated + # primfuncs. + "num_vectors_per_tensor", + # Reserved columns defined by the BenchmarksTable class. + "row_status", + "timings_min_usecs", + "timings_max_usecs", + "timings_median_usecs", + "timings_mean_usecs", + "timings_stddev_usecs", + # For benchmarks that produce files on the host file system, this indicates + # their location. Useful for post-mortem investigation of benchmark results. + "host_files_dir_path", + # Miscellaneous comments about the benchmark. + "comments", +] + +_HOST_OUTPUT_DIR = tempfile.mkdtemp() + +_PRIMFUNC_NAME = "elemwise_add" + +print("-" * 80) +print("OUTPUT DIRECTORY: {}".format(_HOST_OUTPUT_DIR)) +print("-" * 80) +print() + + +class UnsupportedException(Exception): + """ + Indicates that the specified benchmarking configuration is known to + currently be unsupported. The Exception message may provide more detail. + """ + + +class NumericalAccuracyException(Exception): + """ + Indicates that the benchmarking configuration appeared to run successfully, + but the output data didn't have the expected accuracy. + """ + + +from typing import Tuple + + +def _get_irmod_elemwise_add( + _PRIMFUNC_NAME: str, shape: list, dtype: str, mem_scope: str +) -> tvm.ir.module.IRModule: + """ + Return an IRModule containing a single primfunc, expressed as NS-TIR. + + The primfunc implements elementwise-add. Its signature is (A,B,C), where + A and B are the input tensors, and C is the output tensor. + All three tensors have the specfied shape, dtype, and mem_scope. + + If the specified primfunc is known to be unsupported, raise an UnsupportedExcetion. + """ + assert len(shape) == 2 + + # TVMScript can reference simple Python variables, but it doesn't + # curently support more complex Python expressions... 
+ ( + dim0_size, + dim1_size, + ) = shape + dtype_str = str(dtype) + + if mem_scope == "global.vtcm": + raise UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") + + # This check is currently elided by the one above, but it should become relevant as soon + # as we add VTCM support to this kernel generator. + # + # Also: The VTCM budget is a very rough estimate, based only on experience. + # Assuming that it's even reasonable to use a hard-coded estimate AT ALL, this number + # may need tweaking. + estimated_vtcm_budget_bytes = HVX_VECTOR_BYTES * 1024 + + dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits + assert dtype_bits % 8 == 0 + dtype_bytes = dtype_bits // 8 + + num_vtcm_tensors = 3 + estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors + + if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes: + raise UnsupportedException("Expect to exceed VTCM budget.") + + @tvm.script.ir_module + class BenchmarkModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + # We exchange data between function by handles, which are similar to pointer. + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + + A = T.match_buffer(a, shape, dtype=dtype) + B = T.match_buffer(b, shape, dtype=dtype) + C = T.match_buffer(c, shape, dtype=dtype) + + for i in range(dim0_size): + for j in range(dim1_size): + C[i, j] = A[i, j] + B[i, j] + + return BenchmarkModule + + +def _benchmark_hexagon_elementwise_add_kernel( + hexagon_launcher: HexagonLauncherRPC, shape: list, dtype: str, mem_scope: str +): + """ + Generate and benchmark a single elementwise-add kernel for Hexagon. + + Produce these outputs: + - Printed status updates / results to stdout and/or stderr. + + - Create a new subdirectory under _HOST_OUTPUT_DIR, and populate it with + various logs and intermediate files. + + - Add to _BT a row describing this benchmark run. + """ + # Represent the benchmark details in a form required by the benchmark table + # and for other logging... + keys_dict = { + "basic_kernel": "ewise-add", + "dtype": dtype, + "shape": shape, + "mem_scope": mem_scope, + } + + desc = benchmark_util.get_benchmark_decription(keys_dict) + + # Create the host-side directory for this benchmark run's files / logs... + host_files_dir_name = benchmark_util.get_benchmark_id(keys_dict) + host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name) + os.mkdir(host_files_dir_path) + + keys_dict["host_files_dir_path"] = host_files_dir_path + + log_file_path = os.path.join(host_files_dir_path, "out.txt") + with open(log_file_path, "w") as log_file: + print(f"CONFIGURATION: {desc}") + log_file.write(f"CONFIGURATION: {desc}\n") + + try: + ns_tir_module = _get_irmod_elemwise_add(_PRIMFUNC_NAME, shape, dtype, mem_scope) + + # Dump the primfunc NS-TIR (as text) to the log file... + lowered_mod = tvm.lower(ns_tir_module, _PRIMFUNC_NAME) + log_file.write("LOWERED IR MODULE:\n") + log_file.write(str(lowered_mod)) + log_file.write("\n") + + # Lower the primfunc's IRModule to Hexagon object code... + A = tvm.te.placeholder(shape, dtype=dtype) + B = tvm.te.placeholder(shape, dtype=dtype) + C = tvm.te.placeholder(shape, dtype=dtype) + + built_module: tvm.driver.build_module.OperatorModule = tvm.build( + ns_tir_module, + [ + A, + B, + C, + ], + _SUPER_TARGET, + name=_PRIMFUNC_NAME, + ) + + # Create an actual Hexagon-native shared object file, initially stored on the + # host's file system... 
+ host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so") + built_module.save(host_dso_binary_path) + print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}") + + # Upload the .so to the Android device's file system (or wherever is appropriate + # when using the Hexagon simulator)... + target_dso_binary_filename = "test_binary.so" + hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename) + + # Generate our testing / validation data... + ( + host_numpy_A_data, + host_numpy_B_data, + host_numpy_C_data_expected, + ) = _get_elemwise_add_reference_value_tensors(shape, dtype) + + with hexagon_launcher.start_session() as sess: + # On the target device / simulator, make our Hexagon-native shared object + # available for use... + loaded_hexagon_module: tvm.runtime.module.Module = hexagon_launcher.load_module( + target_dso_binary_filename, sess + ) + + # Create the target-side tensors to hold the primfunc's inputs and outputs... + A_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) + B_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) + C_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) + + # Populate the primfunc's input tensors... + A_data.copyfrom(host_numpy_A_data) + B_data.copyfrom(host_numpy_B_data) + + # Actually benchmark the primfunc... + timer = loaded_hexagon_module.time_evaluator( + "main", sess.device, number=10, repeat=1 + ) + timing_result = timer(A_data, B_data, C_data) + + print(f"TIMING RESULT: {timing_result}") + log_file.write(f"TIMING RESULT: {timing_result}\n") + + # Verify that the computation actually happened, and produced the correct result. + result = C_data.numpy() + + if dtype == "float16": + # These are the closest tolerance we currently expect / require for these + # kernels. They may be changed in the future. + rel_tolerance = 0.005 + abs_tolerance = 2.0 + elif dtype == "int8": + rel_tolerance = 0 + abs_tolerance = 0 + else: + raise Exception(f"Unexpected dtype: {dtype}") + + # TODO: We're assuming that *any* assertion thrown by 'assert_allclose' is because + # the numerical differences were too large. But ideally this code would + # differentiate between (a) numerical difference errors, which should simply be + # recorded as a failed benchmark run, vs. (b) more serious errors that should + # kill the overall script. + try: + tvm.testing.assert_allclose( + result, host_numpy_C_data_expected, rel_tolerance, abs_tolerance + ) + except AssertionError as e: + raise NumericalAccuracyException(str(e)) + + _BT.record_success(timing_result, **keys_dict) + + except NumericalAccuracyException as e: + print() + print(f"FAIL: Numerical accuracy error. See log file.") + + log_file.write("\n") + log_file.write(f"FAIL: {e}\n") + + _BT.record_fail(**keys_dict, comments=f"Numerical accuracy error. See log file.") + + except UnsupportedException as e: + print() + print(f"SKIP: {e}") + + log_file.write("\n") + log_file.write(f"SKIP: {e}\n") + + _BT.record_skip(**keys_dict, comments=f"Unsupported configuration: {e}") + + +def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str): + """ + Return [A:np.array, B:np.array, C:np.array] + + `A`, `B`, and `C` are reference data used to exercise and validate + an elementwise-add kernel: C = A+B. + + NOTE: These data are primarily meant for performance testing. + The values may be helpful in detecting correctness issues, but that's + a secondary consideration here. 
+ """ + assert len(shape) == 2 + + A = np.ndarray(shape, dtype=dtype) + B = np.ndarray(shape, dtype=dtype) + + np_dtype = A.dtype + + if np_dtype.kind in ["i", "u"]: + # We allow overflow for integer types because it tends to be well-behaved + # and well-understood... + min_value = np.iinfo(np_dtype).min + max_value = np.iinfo(np_dtype).max + + next_value = min_value + + for i in range(shape[0]): + for j in range(shape[1]): + A[i, j] = next_value + B[i, j] = next_value * 2 + next_value += 1 + + elif np_dtype.kind == "f": + # NOTE: For simplicity, we avoid test data that that require + # well-defined behavior on floating-point overflow. + # But it may be reasonable to test that in the future. + min_value = np.finfo(np_dtype).min + max_value = np.finfo(np_dtype).max + + min_input_value = min_value / 2.0 + 1 + max_input_value = max_value / 2.0 - 2 + delta = (max_input_value - min_input_value) / (shape[0] * shape[1]) + + next_value = min_input_value + + for i in range(shape[0]): + for j in range(shape[1]): + A[i, j] = next_value + B[i, j] = next_value + 1 + next_value += delta + + else: + assert False, f"Unexpected data type: {np_dtype}" + + C = A + B + return [ + A, + B, + C, + ] + + +@tvm.testing.requires_hexagon +def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC): + for dtype in [ + "int8", + "float16", + ]: + + for mem_scope in [ + "global", + "global.vtcm", + ]: + + # These numbers are fairly arbitrary, but they're meant to stress memory/caches to + # various extents. + for num_vectors_per_tensor in [ + 1, + 16, + 64, + 512, + 2048, + ]: + + dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits + assert dtype_bits % 8 == 0 + dtype_bytes = dtype_bits // 8 + + elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes + + shape = [ + num_vectors_per_tensor, + elem_per_hvx_vector, + ] + + print() + _benchmark_hexagon_elementwise_add_kernel(hexagon_launcher, shape, dtype, mem_scope) + + print("-" * 80) + print(f"OUTPUT DIRECTORY: {_HOST_OUTPUT_DIR}") + print("-" * 80) + print() + + tabular_output_filename = os.path.join(_HOST_OUTPUT_DIR, "benchmark-results.csv") + with open(tabular_output_filename, "w") as csv_file: + _BT.print_csv(csv_file, _CSV_COLUMN_ORDER) + + print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}") + + _BT.print_csv(sys.stdout, _CSV_COLUMN_ORDER) + + if _BT.has_fail() > 0: + pytest.fail("At least one benchmark configuration failed", pytrace=False) diff --git a/tests/python/contrib/test_hexagon/benchmark_hexagon.py b/tests/python/contrib/test_hexagon/benchmark_hexagon.py deleted file mode 100644 index 2a1d6796e7315..0000000000000 --- a/tests/python/contrib/test_hexagon/benchmark_hexagon.py +++ /dev/null @@ -1,245 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import os.path -import sys -import pytest -import numpy as np -import logging -import tempfile - -import tvm.testing -from tvm import te -from tvm.contrib.hexagon.build import HexagonLauncherRPC -from .benchmark_util import BenchmarksTable - -RPC_SERVER_PORT = 7070 - -# This is a fixed detail of the v68 architecture. -HVX_VECTOR_BYTES = 128 - -# NOTE on server ports: -# These tests use different port numbers for the RPC server (7070 + ...). -# The reason is that an RPC session cannot be gracefully closed without -# triggering TIME_WAIT state on the server socket. This prevents another -# server to bind to the same port until the wait time elapses. - - -@tvm.testing.requires_hexagon -def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC): - """ - Starting with an elementwise-add computation, try various schedules / optimizations to - see the impact they have on performance. - - The main motivation for this test is to explore the relationship between these - schedules / optimizations vs. how effectively the primfunc uses the Hexagon's - HVX units. - """ - host_output_dir = tempfile.mkdtemp() - - print("-" * 80) - print("OUTPUT DIRECTORY: {}".format(host_output_dir)) - print("-" * 80) - print() - - bt = BenchmarksTable() - - # Create and benchmark a single primfunc. - # If an unexpected problem occurs, raise an exception. Otherwise add a row of output to 'bt'. - def test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor): - version_name = f"dtype:{dtype}-schedtype:{sched_type}-memscope:{mem_scope}-numvecs:{num_vectors_per_tensor}" - print() - print(f"CONFIGURATION: {version_name}") - - if num_vectors_per_tensor == 2048 and mem_scope == "global.vtcm": - bt.record_skip( - dtype=dtype, - sched_type=sched_type, - mem_scope=mem_scope, - num_vectors_per_tensor=num_vectors_per_tensor, - comments="Expect to exceed VTCM budget.", - ) - return - - dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits - assert dtype_bits % 8 == 0 - dtype_bytes = dtype_bits // 8 - - elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes - - # Note! We're providing the complete input tensor shapes now, - # whereas the original code only reveals the exact shape when - # about to call the kernel. - - shape = [ - num_vectors_per_tensor, - elem_per_hvx_vector, - ] - - A = tvm.te.placeholder(shape, dtype=dtype) - B = tvm.te.placeholder(shape, dtype=dtype) - C = tvm.te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C") - - sched = tvm.te.create_schedule(C.op) - - if sched_type == 1: - pass - elif sched_type == 2: - sched[C].vectorize(C.op.axis[1]) - else: - raise Exception("Unknown schedule type") - - # If we're using VTCM, we *must* add a transform_layout step to the schedule. - # Otherwise the generated code will crash. - # As of 2022-04-12 the crash does not provide a useful error message to the - # host Python code. - if mem_scope == "global.vtcm": - for tensor in [A, B, C]: - sched[tensor].transform_layout(lambda i, j: [i, te.AXIS_SEPARATOR, j]) - - # This module is only created so humans can inspect its IR. 
- module_for_ir_dump = tvm.lower(sched, [A, B, C], "foo") - - report_path = os.path.join(host_output_dir, f"{version_name}.txt") - - with open(report_path, "w") as f: - f.write("LOWERED IR MODULE:\n") - f.write(str(module_for_ir_dump)) - f.write("\n") - - target_hexagon = tvm.target.hexagon("v68", link_params=True) - func = tvm.build( - sched, - [A, B, C], - tvm.target.Target(target_hexagon, host=target_hexagon), - name="elemwise_add", - ) - - host_dso_binary_path = os.path.join(host_output_dir, f"test_binary-{version_name}.so") - target_dso_binary_filename = "test_binary.so" - - func.save(str(host_dso_binary_path)) - print("SAVED BINARY TO HOST PATH: {}".format(str(host_dso_binary_path))) - - hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename) - - try: - with hexagon_launcher.start_session() as sess: - mod = hexagon_launcher.load_module(target_dso_binary_filename, sess) - - host_numpy_A_data = np.ndarray(shape, dtype=dtype) - host_numpy_B_data = np.ndarray(shape, dtype=dtype) - - for i in range(shape[0]): - for j in range(shape[1]): - host_numpy_A_data[i, j] = i + j - host_numpy_B_data[i, j] = (i + 1) * (j + 1) - - host_numpy_C_data_expected = host_numpy_A_data + host_numpy_B_data - - A_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) - A_data.copyfrom(host_numpy_A_data) - - B_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) - B_data.copyfrom(host_numpy_B_data) - - C_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope) - - # NOTE: We may want to soften these numbers, depending on future findings. - timer = mod.time_evaluator("elemwise_add", sess.device, number=10, repeat=1) - timing_result = timer(A_data, B_data, C_data) - - # Verify that the computation actually happened, and produced the correct result. - result = C_data.numpy() - tvm.testing.assert_allclose(host_numpy_C_data_expected, result) - - bt.record_success( - timing_result, - dtype=dtype, - sched_type=sched_type, - mem_scope=mem_scope, - num_vectors_per_tensor=num_vectors_per_tensor, - ) - - except Exception as err: - f.write("ERROR:\n") - f.write("{}\n".format(err)) - bt.record_fail( - dtype=dtype, - sched_type=sched_type, - mem_scope=mem_scope, - num_vectors_per_tensor=num_vectors_per_tensor, - comments=f"See {report_path}", - ) - - # ----------------------------------------------------------------------------------------------- - - csv_column_order = [ - "dtype", - "sched_type", - "mem_scope", - "num_vectors_per_tensor", - "row_status", - "timings_min_usecs", - "timings_max_usecs", - "timings_median_usecs", - "timings_mean_usecs", - "timings_stddev_usecs", - "comments", - ] - - # Hexagon v69 allows more dtypes, but we're sticking with v68 for now. - for dtype in [ - "int8", - ]: - - # These numbers are only meaningful in the context of this script. - for sched_type in [ - 1, - 2, - ]: - - for mem_scope in ["global", "global.vtcm"]: - - # These numbers are fairly arbitrary, but they're meant to stress memory/caches to - # various extents. - for num_vectors_per_tensor in [ - 1, - 16, - 64, - 512, - 2048, - ]: - - test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor) - - # Report our progress. 
- bt.print_csv(sys.stdout, csv_column_order) - - print("-" * 80) - print(f"OUTPUT DIRECTORY: {host_output_dir}") - print("-" * 80) - print() - - tabular_output_filename = os.path.join(host_output_dir, "benchmark-results.csv") - with open(tabular_output_filename, "w") as csv_file: - bt.print_csv(csv_file, csv_column_order) - print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}") - - if bt.has_fail() > 0: - pytest.fail("At least one benchmark configuration failed", pytrace=False) diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py index 5a75e9a6e80fb..113c7780c130f 100644 --- a/tests/python/contrib/test_hexagon/benchmark_util.py +++ b/tests/python/contrib/test_hexagon/benchmark_util.py @@ -139,3 +139,37 @@ def print_csv(self, f, column_name_order, timing_decimal_places=3): csv_line_dict[col_name] = str_value writer.writerow(csv_line_dict) + + +def get_benchmark_id(keys_dict): + """ + Given a dictionary with the distinguishing characteristics of a particular benchmark + line item, compute a string that uniquely identifies the benchmark. + + The returned string: + - is a valid directory name on the host's file systems, and + - should be easy for humans to parse + + Note that the insertion order for `keys_dict` affects the computed name. + """ + # Creat a copy, because we might be modifying it. + d = dict(keys_dict) + + # Sniff for shape-like lists, because we want them in a form that's both + # readable and filesystem-friendly... + for k, v in d.items(): + if isinstance(v, list) or isinstance(v, tuple): + v2 = "_".join([str(x) for x in v]) + d[k] = v2 + + return "-".join([f"{k}:{v}" for k, v in d.items()]) + + +def get_benchmark_decription(keys_dict): + """ + Similar to `get_benchmark_id`, but the focus is on human-readability. + + The returned string contains no line-breaks, but may contain spaces and + other characters that make it unsuitable for use as a filename. + """ + return " ".join([f"{k}={v}" for k, v in keys_dict.items()]) From b086005f8f9d439ff8397dcc6b048fd8dda5a995 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 3 Jun 2022 10:58:28 -0700 Subject: [PATCH 033/181] [ci] Fix action expressions for tvm-bot workflow (#11556) These weren't caught by `actionlint` for some reason but GitHub doesn't merge multiple `if`s, so this combines them into one. 
Co-authored-by: driazati --- .github/workflows/tvmbot.yml | 3 +-- tests/scripts/git_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tvmbot.yml b/.github/workflows/tvmbot.yml index c9d2cf71e6a70..784f6899a3be3 100644 --- a/.github/workflows/tvmbot.yml +++ b/.github/workflows/tvmbot.yml @@ -13,9 +13,8 @@ concurrency: jobs: run-tvm-bot: - if: github.repository == 'apache/tvm' + if: ${{ github.event.issue.pull_request && github.repository == 'apache/tvm' }} runs-on: ubuntu-20.04 - if: ${{ github.event.issue.pull_request }} steps: - uses: actions/checkout@v2 - name: Run tvm-bot diff --git a/tests/scripts/git_utils.py b/tests/scripts/git_utils.py index 7cd1b6b2fe596..0e2e85e552431 100644 --- a/tests/scripts/git_utils.py +++ b/tests/scripts/git_utils.py @@ -33,15 +33,15 @@ def compress_query(query: str) -> str: def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None): print(f"Requesting POST to", url, "with", body) headers = {} + req = request.Request(url, headers=headers, method="POST") if auth is not None: - auth_str = base64.b64encode(f"{auth[0]}:{auth[1]}") - request.add_header("Authorization", f"Basic {auth_str}") + auth_str = base64.b64encode(f"{auth[0]}:{auth[1]}".encode()) + req.add_header("Authorization", f"Basic {auth_str}") if body is None: body = "" req.add_header("Content-Type", "application/json; charset=utf-8") - req = request.Request(url, headers=headers, method="POST") data = json.dumps(body) data = data.encode("utf-8") req.add_header("Content-Length", len(data)) From 9dceb4e191c5588046c1478243d031f0b6052311 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Fri, 3 Jun 2022 13:19:53 -0700 Subject: [PATCH 034/181] [BYOC] Two helper passes for external codegen using RelayToTIR custom pass machinery (#11474) * [BYOC] Two helper passes for external codegen using RelayToTIR custom pass machinery (See https://discuss.tvm.apache.org/t/byoc-supporting-cutlass-byoc-with-collage/12796/6 for context, which in turn is part of Collage (https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md). For reasons explained in the above thread I'm moving CUTLASS to be IRModule-at-a-time external codegen using a custom RelayToTIR pass instead of the traditional function-at-a-time external codegen using a relay.ext.cutlass registered function. This means some of the rewriing done on-the-fly by LowerTEPass now needs to be done by the custom pass directly. This PR supplies two passes which ease that burden: - Before starting the CUTLASS-specific processing, make sure all "Compiler" attributed functions have unique global definitions (ie are outlined). Though functions start in this form after BYOC partitioning, under Graph and AOT compilation flows those functions are then inlined to pass through the 'codegen' keyhole which assumes the whole model is just one self-contained main function. This pass will undo that. (I gave up trying to just remove the inlining in the first place.) - After the CUTLASS-specific processing the now compiled "Compiler" attributed functions need to marked as 'extern'. The te_compiler.cc uses the "ExternalSymbol" attribute for that, but since a) the symbol name is never needed, on the presense of the attribute is significant downstream and b) "ExternalSymbol" is easy to confuse with "global_symbol", I just replaced "ExternalSymbol" with "Extern" with an Integer(1) (cf "Primitive"). 
The outlining pass is a little more general than necessary because it (will also) be used by Collage to rewrite the IRModule into optimally partitioned form while making maximal reuse of partition functions. Hence the abstract GlobalSymbolCache. * - Andrew's comments --- include/tvm/ir/expr.h | 3 +- include/tvm/relay/attrs/call.h | 2 +- include/tvm/relay/function.h | 32 ++- python/tvm/relay/transform/transform.py | 70 ++++-- src/ir/expr.cc | 3 +- src/parser/tokenizer.h | 4 +- src/relay/backend/te_compiler.cc | 8 +- src/relay/backend/vm/compiler.cc | 4 +- src/relay/ir/function.cc | 2 +- src/relay/op/nn/nn.cc | 1 + .../transforms/compiler_function_utils.cc | 212 ++++++++++++++++++ .../transforms/compiler_function_utils.h | 135 +++++++++++ src/relay/transforms/dead_code.cc | 6 +- src/relay/transforms/inline.cc | 5 +- .../transform/test_compiler_function_utils.py | 162 +++++++++++++ 15 files changed, 608 insertions(+), 41 deletions(-) create mode 100644 src/relay/transforms/compiler_function_utils.cc create mode 100644 src/relay/transforms/compiler_function_utils.h create mode 100644 tests/python/relay/transform/test_compiler_function_utils.py diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index 4a00de802c61e..b54a067e1c941 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -260,9 +260,10 @@ class GlobalVarNode : public RelayExprNode { */ class GlobalVar : public RelayExpr { public: - TVM_DLL explicit GlobalVar(String name_hint, Type type = {}); + TVM_DLL explicit GlobalVar(String name_hint, Type type = {}, Span span = {}); TVM_DEFINE_OBJECT_REF_METHODS(GlobalVar, RelayExpr, GlobalVarNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(GlobalVarNode); }; // PrimExprs that are useful as runtime containers. diff --git a/include/tvm/relay/attrs/call.h b/include/tvm/relay/attrs/call.h index 167a593ff377b..e0b347de17837 100644 --- a/include/tvm/relay/attrs/call.h +++ b/include/tvm/relay/attrs/call.h @@ -35,7 +35,7 @@ namespace relay { * \brief Metadata for calls to TIR functions, useful for program analysis crossing Relay and TIR. */ struct CallLoweredAttrs : public tvm::AttrsNode { - /*! \brief The metadata attached to the call node. */ + /*! \brief Additional metadata attached to the call node. Should be replaced by explict fields. */ Map metadata; TVM_DECLARE_ATTRS(CallLoweredAttrs, "relay.attrs.CallLoweredAttrs") { diff --git a/include/tvm/relay/function.h b/include/tvm/relay/function.h index 5869f878aa856..052d04fe24119 100644 --- a/include/tvm/relay/function.h +++ b/include/tvm/relay/function.h @@ -170,19 +170,40 @@ const FunctionNode* AsOptimizableFunctionNode(const BaseFunc& base_func); * \brief namespace of the attributes that can be attached to a relay::Function. */ namespace attr { -/*! \brief Mark the function as a primitive function. */ + +/*! + * \brief Mark the function as representing a sub-graph which is to be lowered or compiled as + * a unit. For example, the function may represent a kernel which TVM will lower to a PrimFunc. + * If present should be bound to \p Integer(1). May be accompanied by "Compiler", see below. + * The function body should be considered opaque by Relay, and many passes simply ignore these + * functions. + * + * Type: Integer + */ constexpr const char* kPrimitive = "Primitive"; + +/*! + * \brief Mark the function as externally implemented, ie bound in a runtime::Module within the + * IRModule's "external_mods" attribute. If present should be bound to \p Integer(1). Generally + * the only attribute when present. 
+ * + * Type: Integer + */ +constexpr const char* kExtern = "Extern"; + /*! - * \brief Indicate the compiler that should be used for building this function. - * When this is unset or set to "default", the default compilation pipeline will be used. + * \brief Indicates the name of the external codegen 'compiler' that should be used to lower + * or compile the function other than TVM's default lowering pipeline. The name may correspond + * to a TargetKind name. There may be a global function registered under 'relay.ext.{name}'. + * + * Type: String */ constexpr const char* kCompiler = "Compiler"; + /*! \brief Indicate if the function is a closure. */ constexpr const char* kClosure = "Closure"; /*! \brief Store a Var to parameter/Constant mapping on a Function. */ constexpr const char* kParams = "__params__"; -/*! \brief Store the unique external symbol for external compilers. */ -constexpr const char* kExternalSymbol = "ExternalSymbol"; /*! \brief Mark if the function should be avoided being optimized. */ constexpr const char* kSkipOptimization = "SkipOptimization"; /*! \brief Treat the function as a composite operator. */ @@ -193,6 +214,7 @@ constexpr const char* kInline = "Inline"; constexpr const char* kPartitionedFromPattern = "PartitionedFromPattern"; /*! \brief Mark the function as only composed of reshape operations. */ constexpr const char* kReshapeOnly = "relay.reshape_only"; + } // namespace attr } // namespace relay diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 9f253f8e88ba7..694dbb45218ca 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -802,24 +802,6 @@ def Inline(): return _ffi_api.Inline() -def InlineComposites(target): - """Perform inlining on the given Relay IR module. The functions originate - from the MergeComposite pass based on an input pattern table will fold back - to main. Currently, this is used for the TRT BYOC which expects a single - primitive function to operate on. - - Parameters - ---------- - target: str - The byoc target for which ops need to fold back to primitive function. - Returns - ------- - ret: tvm.transform.Pass - The registered pass that performs inlining for a Relay IR module. - """ - return _ffi_api.InlineComposites(target) - - def gradient(expr, mod=None, mode="higher_order"): """ Transform the input function, @@ -1386,3 +1368,55 @@ def SplitArgs(max_function_args): The registered pass for constant folding. """ return _ffi_api.SplitArgs(max_function_args) + + +def OutlineCompilerFunctionsWithExistingGlobalSymbols(compiler_filter=""): + """Outlines all literal functions in direct call positions which have a "Compiler" + attribute. + + The outlined functions are bound to unique global vars according to their existing + "global_symbol" attribute. At most one function with the same global symbol is outlined. + + If compiler_filter is non-empty only functions with that as their attribute value are + outlined. + + This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism + to prepare the IRModule before custom lowering. + + Parameters + ---------- + compiler_filter : String + If non-empty, the 'compiler' attribute to filter on. + + Returns + ------- + ret : tvm.transform.Pass + The pass. 
+ """ + return _ffi_api.OutlineCompilerFunctionsWithExistingGlobalSymbols(compiler_filter) + + +def MarkCompilerFunctionsAsExtern(compiler_filter=""): + """Marks all global functions which have a "Compiler" attribute matching + compiler_filter as 'extern'. + + The function's attributes are replaced with a single "Extern" attribute, and + all calls to the function are switched to use the 'call_lowered' calling convention. + + If compiler_filter is non-empty only functions with that as their attribute value are + outlined. + + This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism to + cleanup the IRModule after custom lowering. + + Parameters + ---------- + compiler_filter : String + If non-empty, the 'compiler' attribute to filter on. + + Returns + ------- + ret : tvm.transform.Pass + The pass. + """ + return _ffi_api.MarkCompilerFunctionsAsExtern(compiler_filter) diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 399873492f041..a3318bf94fc66 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -141,10 +141,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "range(min=" << op->min << ", ext=" << op->extent << ')'; }); -GlobalVar::GlobalVar(String name_hint, Type type) { +GlobalVar::GlobalVar(String name_hint, Type type, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name_hint); n->checked_type_ = std::move(type); + n->span = std::move(span); data_ = std::move(n); } diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h index 4ac1ceef26dce..505784e4bf70e 100644 --- a/src/parser/tokenizer.h +++ b/src/parser/tokenizer.h @@ -295,8 +295,6 @@ struct Tokenizer { int line = this->line; int column = this->col; - ICHECK_EQ(Peek(), '['); - Next(); std::stringstream type_key; while (More() && Peek() != ']') { type_key << Next(); @@ -498,7 +496,7 @@ struct Tokenizer { auto token = NewToken(TokenType::kQuestion); Next(); return token; - } else if (MatchString("meta")) { + } else if (MatchString("meta[")) { return TokenizeMetaRef(); } else if (next == '#') { return TokenizeAttr(); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 73b44f7361a57..c78f3abd6eccf 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -168,7 +168,7 @@ class TECompilerImpl : public TECompilerNode { if (const auto* function_node = kv2.second.as()) { // Abandon the existing function annotations. - // Unfortuantely, Optional() is indistinguishable from + // Unfortunately, Optional() is indistinguishable from // NullValue(), and DictAttrs() is nullptr, so to erase the attributes, we // need pass in DictAttrs()), which is a DictAttrs containing no // attributes. @@ -176,8 +176,8 @@ class TECompilerImpl : public TECompilerNode { WithFields(GetRef(function_node), function_node->params, function_node->body, function_node->ret_type, function_node->type_params, /* erase attributes */ DictAttrs(Map())); - // Mark function as 'extern' using the "ExternalSymbol" attribute. - function = WithAttr(std::move(function), attr::kExternalSymbol, kv2.first->name_hint); + // Mark function as 'extern'. 
+ function = WithAttr(std::move(function), attr::kExtern, Integer(1)); module->Add(kv2.first, function); } } @@ -688,7 +688,7 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { Expr DeviceAwareVisitExpr_(const FunctionNode* function_node) override { if (function_node->HasNonzeroAttr(attr::kPrimitive) || - function_node->GetAttr(attr::kExternalSymbol)) { + function_node->HasNonzeroAttr(attr::kExtern)) { // Nothing to lower inside primitive/external functions. return GetRef(function_node); } else { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index e0b742a840906..d9730b1b5a4ca 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -922,7 +922,7 @@ void VMCompiler::LowerImpl(IRModule mod) { for (const auto& pair : context_.module->functions) { auto gvar = pair.first; if (auto* n = pair.second.as()) { - if (n->GetAttr(attr::kExternalSymbol).defined()) { + if (n->HasNonzeroAttr(attr::kExtern)) { // Already compiled during lowering. continue; } @@ -1131,7 +1131,7 @@ size_t VMCompiler::PopulateGlobalMap() { // Excludes PrimFuncs and externs, which are managed by the primitive_map_. for (const auto& kv : context_.module->functions) { if (const auto* function_node = kv.second.as()) { - if (!function_node->GetAttr(attr::kExternalSymbol)) { + if (!function_node->HasNonzeroAttr(attr::kExtern)) { context_.global_map.emplace(kv.first, context_.global_map.size()); } } diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc index bf0dd577a4d29..63e74144e0616 100644 --- a/src/relay/ir/function.cc +++ b/src/relay/ir/function.cc @@ -112,7 +112,7 @@ FuncType FunctionNode::func_type_annotation() const { const FunctionNode* AsOptimizableFunctionNode(const BaseFunc& base_func) { if (const auto* function_node = base_func.as()) { if (!function_node->GetAttr(attr::kCompiler).defined() && - !function_node->GetAttr(attr::kExternalSymbol).defined() && + !function_node->HasNonzeroAttr(attr::kExtern) && !function_node->HasNonzeroAttr(attr::kSkipOptimization)) { return function_node; } diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 234cafdca1502..41b47401de1c2 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -1012,6 +1012,7 @@ Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT f - **out**: `(b, m, n)`. )code" TVM_ADD_FILELINE) + .set_attrs_type() .set_num_inputs(2) .add_argument("tensor_a", "3D Tensor", "The first input.") .add_argument("tensor_b", "3D Tensor", "The second input.") diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc new file mode 100644 index 0000000000000..b98d089b346a3 --- /dev/null +++ b/src/relay/transforms/compiler_function_utils.cc @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/transforms/compiler_function_utils.cc + * \brief Helper passes for working with functions with the "Compiler" attribute. + */ + +#include "./compiler_function_utils.h" + +#include "../op/call/call.h" +#include "tvm/relay/analysis.h" +#include "tvm/relay/expr_functor.h" + +namespace tvm { +namespace relay { +namespace transforms { +namespace { + +/*! + * \brief Rewrite calls to inlined "Compiler" functions to global functions. The given + * module will be extended with the newly outlined functions. + */ +class Outliner : public MixedModeMutator { + public: + Outliner(GlobalSymbolCache* cache, std::string compiler_filter, IRModule mod) + : cache_(cache), compiler_filter_(std::move(compiler_filter)), mod_(std::move(mod)) {} + + Expr Rewrite_(const CallNode* pre, const Expr& post) final { + Call new_call = Downcast(post); + if (const auto* function_node = new_call->op.as()) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler.defined() && + (compiler_filter_.empty() || opt_compiler.value() == compiler_filter_)) { + auto function = GetRef(function_node); + DCHECK(FreeVars(function).empty()) << "Function marked with '" << attr::kCompiler + << "' attribute should not have free variables"; + // Ask the cache to supply a unique global var for this function. + GlobalVar global_symbol = cache_->GetGlobalSymbol(function); + // Depending on the cache's implementation, two structurally equal (but not object equal) + // functions may be assigned the same global symbol. If so we'll lift it just once, but + // rewrite all the calls. + if (!mod_->ContainGlobalVar(global_symbol->name_hint)) { + function = + WithAttr(std::move(function), tvm::attr::kGlobalSymbol, global_symbol->name_hint); + mod_->Add(global_symbol, function); + } + // Update the call. + return WithFields(new_call, global_symbol); + } + } + return post; + } + + private: + /*! + * \brief A cached mapping from functions to global variables. Depending on the implementation + * the cache may generate fresh symbols or require the function to already have a "global_symbol" + * attribute, and may share symbols between structurally equal functions. + */ + GlobalSymbolCache* cache_; + /*! \brief If non-empty, the "Compiler" attribute value to require on functions to outline. */ + std::string compiler_filter_; + /*! \brief Module being rewritten. */ + IRModule mod_; +}; + +/*! + * \brief Rewrite calls to global "Compiler" functions to use the 'call_lowered' convention. 
+ */ +class CallRewriter : public MixedModeMutator { + public: + CallRewriter(std::string compiler_filter, IRModule mod) + : compiler_filter_(std::move(compiler_filter)), mod_(std::move(mod)) {} + + Expr Rewrite_(const CallNode* pre, const Expr& post) final { + Call new_call = Downcast(post); + if (const auto* global_var_node = new_call->op.as()) { + if (const auto* function_node = + mod_->Lookup(GetRef(global_var_node)).as()) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler.defined() && + (compiler_filter_.empty() || opt_compiler.value() == compiler_filter_)) { + Optional opt_global_symbol = + function_node->GetAttr(tvm::attr::kGlobalSymbol); + ICHECK(opt_global_symbol.defined()); + GlobalVar global_symbol = mod_->GetGlobalVar(opt_global_symbol.value()); + CallLoweredAttrs attrs; + attrs.metadata.Set("relay_attrs", new_call->attrs); + return CallLowered(global_symbol, new_call->args, attrs, new_call->span); + } + } + } + return post; + } + + private: + /*! \brief If non-empty, the "Compiler" attribute value to require on functions to outline. */ + std::string compiler_filter_; + /*! \brief Module being rewritten. */ + IRModule mod_; +}; + +} // namespace + +GlobalVar ExistingGlobalSymbolCache::GetGlobalSymbol(const Function& function) { + Optional opt_global_symbol = function->GetAttr(tvm::attr::kGlobalSymbol); + ICHECK(opt_global_symbol.defined()) + << "ExistingGlobalSymbolCache requires all functions to already have a '" + << tvm::attr::kGlobalSymbol << "' attribute"; + std::string global_symbol = opt_global_symbol.value(); + auto itr = global_vars_.find(global_symbol); + if (itr != global_vars_.end()) { + return itr->second; + } + // Ok if function does not have a checked_type, but if it does capture it in the global var. + GlobalVar global_var(global_symbol, function->checked_type_, function->span); + global_vars_.emplace(global_symbol, global_var); + return global_var; +} + +transform::Pass OutlineCompilerFunctions(std::shared_ptr cache, + std::string compiler_filter) { + runtime::TypedPackedFunc pass_func = + [cache = std::move(cache), compiler_filter = std::move(compiler_filter)]( + IRModule mod, transform::PassContext ctx) { + IRModule output_mod = GetRef(mod.CopyOnWrite()); + for (const auto& kv : mod->functions) { + const FunctionNode* function_node = AsOptimizableFunctionNode(kv.second); + if (function_node) { + Expr new_body = + Outliner(cache.get(), compiler_filter, output_mod).VisitExpr(function_node->body); + Function new_function = + WithFields(GetRef(function_node), /*opt_params=*/{}, new_body); + output_mod->Add(kv.first, new_function); + } + } + return output_mod; + }; + + return tvm::transform::CreateModulePass(pass_func, 0, "OutlineCompilerFunctions", {}); +} + +// Any Java programmers in the house? +transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols(std::string compiler_filter) { + return OutlineCompilerFunctions(std::make_shared(), + std::move(compiler_filter)); +} + +transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter) { + runtime::TypedPackedFunc pass_func = + [compiler_filter = std::move(compiler_filter)](IRModule mod, transform::PassContext ctx) { + IRModule output_mod = mod->ShallowCopy(); + + // First pass, rewrite the calls. + // We have to do this before marking functions as 'extern' to know which calls to rewrite! 
+ for (const auto& kv : mod->functions) { + if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) { + Expr new_body = + CallRewriter(compiler_filter, output_mod).VisitExpr(function_node->body); + Function new_function = + WithFields(GetRef(function_node), /*opt_params=*/{}, new_body); + output_mod->Update(kv.first, new_function); + } + } + + // Second pass, mark functions as 'extern'. + for (const auto& kv : mod->functions) { + if (const auto* function_node = kv.second.as()) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler.defined() && + (compiler_filter.empty() || opt_compiler.value() == compiler_filter)) { + auto new_function = WithFields( + GetRef(function_node), function_node->params, function_node->body, + function_node->ret_type, function_node->type_params, + /* erase attributes */ DictAttrs(Map())); + new_function = WithAttr(std::move(new_function), attr::kExtern, Integer(1)); + output_mod->Update(kv.first, new_function); + } + } + } + + return output_mod; + }; + + return tvm::transform::CreateModulePass(pass_func, 0, "MarkCompilerFunctionsAsExtern", {}); +} + +TVM_REGISTER_GLOBAL("relay._transform.OutlineCompilerFunctionsWithExistingGlobalSymbols") + .set_body_typed(OutlineCompilerFunctionsWithExistingGlobalSymbols); +TVM_REGISTER_GLOBAL("relay._transform.MarkCompilerFunctionsAsExtern") + .set_body_typed(MarkCompilerFunctionsAsExtern); + +} // namespace transforms +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/compiler_function_utils.h b/src/relay/transforms/compiler_function_utils.h new file mode 100644 index 0000000000000..7b5143444bf8a --- /dev/null +++ b/src/relay/transforms/compiler_function_utils.h @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/transforms/compiler_function_utils.h + * \brief Helper passes for working with functions with the "Compiler" attribute. + * + * Those wishing to use the "RelayToTIR" custom pass machinery to do IRModule-at-a-time external + * codegen may find the following two helper passes useful: + * + * - \p OutlineCompilerFunctionsWithExistingGlobalSymbols will lift inline functions with a + * matching "Compiler" attribute to be global functions, using the "global_symbol" attribute + * already assigned. Can be used before custom lowering. + * + * Note that ideally "Compiler" attributed functions would be made global functions as early as + * possible and would stay that way. However, the GraphExecutorCodegen and AOTExecutorCodegen + * assume the entire model can be represented by a single 'main' function, and the Inline pass + * is run to respect that assumption. 
So this pass is mostly just to undo that Pass after modules + * have passed through the 'codegen' keyhole. + * + * See also OutlineCompilerFunctionsMutator in src/relay/backend/contrib/ethosu/codegen.cc. + * + * - (\p OutlineCompilerFunctions is a more general version of the above which can use a custom + * cache to both allocate "global_symbol" names and ensure two strucurally equal functions are + * assigned the same name, and thus lowered only once. This is used by Collage when preparing + * the optimally partitioned IRModule). + * + * - \p MarkCompilerFunctionsAsExtern will replace global functions with a matching "Compiler" + * attribute with the same function with just an "Extern" attribute, signalling the function + * has been dealt with. Calls to such functions will be rewritten to use the 'call_lowered' + * calling convention. Can be used after lowering to cleanup the IRModule. + * + * Note that the above behaviour is hard coded within the TECompiler, but is only available to + * external codegen using the Function-at-a-time "relay.ext.toolchain" extension point. + */ + +#ifndef TVM_RELAY_TRANSFORMS_COMPILER_FUNCTION_UTILS_H_ +#define TVM_RELAY_TRANSFORMS_COMPILER_FUNCTION_UTILS_H_ + +#include +#include +#include + +#include "tvm/ir/transform.h" +#include "tvm/relay/function.h" + +namespace tvm { +namespace relay { +namespace transforms { + +/*! + * \brief Abstract class representing a cache of unique global vars keyed by functions. This can + * be used to ensure structurally equal functions are assigned the same global var object, and + * thus lowered at most once. + */ +class GlobalSymbolCache { + public: + virtual GlobalVar GetGlobalSymbol(const Function& function) = 0; +}; + +/*! + * \brief A \p GlobalSymbolCache that requires every "Compiler" attributed function to already + * have a "global_symbol" attribute. + */ +class ExistingGlobalSymbolCache : public GlobalSymbolCache { + public: + ExistingGlobalSymbolCache() = default; + + GlobalVar GetGlobalSymbol(const Function& function) final; + + private: + /*! \brief Maps already seen global symbol names to their corresponding GlobalVar objects. */ + std::unordered_map global_vars_; +}; + +/*! + * \brief A pass to outline all literal functions in direct call positions which have a "Compiler" + * attribute. The given \p GlobalSymbolCache is used to determine a unique global symbol for each + * function, which is also assigned to the "global_symbol" attribute of the new global function. + * + * At most one function with the same global symbol is outlined. + * + * If \p compiler_filter is non-empty only functions with that as their attribute value are + * outlined. + */ +transform::Pass OutlineCompilerFunctions(std::shared_ptr cache, + std::string compiler_filter = ""); + +/*! + * \brief A pass to outline all literal functions in direct call positions which have a "Compiler" + * attribute. The functions are bound to unique global vars according to their existing + * "global_symbol" attribute. At most one function with the same global symbol is outlined. + * + * If \p compiler_filter is non-empty only functions with that as their attribute value are + * outlined. + * + * This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism + * to prepare the IRModule before custom lowering. + */ +transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols(std::string compiler_filter = ""); + +/*! 
+ * \brief A pass to mark all global functions which have a "Compiler" attribute matching + * compiler_filter as 'extern' by replacing all attributes with a single "Extern" attribute, and + * rewrite all calls to such functions to use the 'call_lowered' calling convention. + * + * If \p compiler_filter is non-empty only functions with that as their attribute value are + * outlined. + * + * This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism to + * cleanup the IRModule after custom lowering. + */ +transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter = ""); + +} // namespace transforms +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_TRANSFORMS_COMPILER_FUNCTION_UTILS_H_ diff --git a/src/relay/transforms/dead_code.cc b/src/relay/transforms/dead_code.cc index 45cb8271b0746..18d2de1bdede3 100644 --- a/src/relay/transforms/dead_code.cc +++ b/src/relay/transforms/dead_code.cc @@ -84,7 +84,7 @@ class PurityVisitor : ExprFunctor { for (const auto& kv : mod_->functions) { if (const auto* function_node = kv.second.as()) { if (function_node->HasNonzeroAttr(attr::kPrimitive) || - function_node->GetAttr(attr::kExternalSymbol)) { + function_node->HasNonzeroAttr(attr::kExtern)) { // Ignore primitive and external functions. continue; } @@ -133,9 +133,11 @@ class PurityVisitor : ExprFunctor { Purity VisitExpr_(const GlobalVarNode* global_var_node) final { auto global_var = GetRef(global_var_node); + ICHECK(mod_->ContainGlobalVar(global_var_node->name_hint)) + << "No definition for '" << global_var_node->name_hint << "'"; auto func = mod_->Lookup(global_var); if (const auto* function_node = func.as()) { - if (!function_node->GetAttr(attr::kExternalSymbol)) { + if (!function_node->HasNonzeroAttr(attr::kExtern)) { return VisitGlobalFunction(global_var, GetRef(function_node)); } } diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index c55b6778093e5..012b3579494f1 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -110,7 +110,7 @@ class Inliner : ExprMutator { if (!function_node->body.defined()) return false; // The function must be annotated with the inline attribute. - // (Note that external functions do not have this attribute!) + // (Note that partitioned functions and external functions do not have this attribute!) if (!function_node->HasNonzeroAttr(attr::kInline)) return false; // The function is not able to be inlined if any callee under the CallGraph @@ -136,8 +136,7 @@ class Inliner : ExprMutator { auto func = Function(fn->params, fn->body, fn->ret_type, fn->type_params, fn->attrs); // Inline the function body to the caller if this function uses default // compiler, i.e. no external codegen is needed. - if (!func->GetAttr(attr::kCompiler).defined() && - !func->GetAttr(attr::kExternalSymbol).defined()) { + if (!func->GetAttr(attr::kCompiler).defined() && !func->HasNonzeroAttr(attr::kExtern)) { ICHECK_EQ(func->params.size(), args.size()) << "Mismatch found in the number of parameters and call args"; // Bind the parameters with call args. diff --git a/tests/python/relay/transform/test_compiler_function_utils.py b/tests/python/relay/transform/test_compiler_function_utils.py new file mode 100644 index 0000000000000..13e0f98e79f19 --- /dev/null +++ b/tests/python/relay/transform/test_compiler_function_utils.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License +"""Unit tests for the OutlineCompilerFunctionsWithExistingGlobalSymbols and + MarkCompilerFunctionsAsExtern external codegen helper passes.""" + +import tvm +import tvm.testing +import numpy as np + + +def make_const(dtype, shape): + return tvm.relay.const(np.random.rand(*shape).astype(dtype)) + + +def make_consts(dtype, shapes): + return [make_const(dtype, shape) for shape in shapes] + + +metatable = { + "relay.Constant": make_consts( + "float16", + [ + (2304, 768), # 0 + (2304,), # 1 + (600, 32, 64), # 2 + ], + ), + "attributes": [{"relay_attrs": None}], +} + + +def inlined_mod(): + return tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + %0 = fn(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16], + Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] { + %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16], + PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] { + %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304); + add(%5, %FunctionVar_0_2) + }; + %4(%y_0_i0, %y_0_i1, %y_0_i2) + }; + %1 = %0(%x0, meta[relay.Constant][0], meta[relay.Constant][1]); + %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16], + Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16], + PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) + }; + %6(%y_3_i0, %y_3_i1) + }; + %3 = %2(%x3, meta[relay.Constant][2]); + (%1, %3) + } + """, + "from_string", + None, + metatable, + ) + + +def expected_outlined_mod(): + return tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + %1 = @tvmgen_default_cutlass_main_0(%x0, meta[relay.Constant][0], meta[relay.Constant][1]); + %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16], + Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16], + 
PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) + }; + %6(%y_3_i0, %y_3_i1) + }; + %3 = %2(%x3, meta[relay.Constant][2]); + (%1, %3) + } + + def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16], + Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] { + %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16], + PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] { + %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304); + add(%5, %FunctionVar_0_2) + }; + %4(%y_0_i0, %y_0_i1, %y_0_i2) + } + """, + "from_string", + None, + metatable, + ) + + +def expected_extern_mod(): + return tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + %1 = call_lowered(@tvmgen_default_cutlass_main_0, (%x0, meta[relay.Constant][0], meta[relay.Constant][1]), metadata=meta[attributes][0]); + %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16], + Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16], + PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) + }; + %6(%y_3_i0, %y_3_i1) + }; + %3 = %2(%x3, meta[relay.Constant][2]); + (%1, %3) + } + + def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16], + Extern=1) -> Tensor[(1600, 2304), float16] { + %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16], + PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] { + %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304); + add(%5, %FunctionVar_0_2) + }; + %4(%y_0_i0, %y_0_i1, %y_0_i2) + } + """, + "from_string", + None, + metatable, + ) + + +def test_outline_compiler_functions_with_existing_global_symbols(): + actual_outlined_mod = tvm.relay.transform.OutlineCompilerFunctionsWithExistingGlobalSymbols( + "cutlass" + )(inlined_mod()) + tvm.ir.assert_structural_equal(actual_outlined_mod, expected_outlined_mod(), map_free_vars=True) + + +def test_mark_compiler_functions_as_extern(): + actual_extern_mod = tvm.relay.transform.MarkCompilerFunctionsAsExtern("cutlass")( + expected_outlined_mod() + ) + tvm.ir.assert_structural_equal(actual_extern_mod, expected_extern_mod(), map_free_vars=True) + + +if __name__ == "__main__": + tvm.testing.main() From 4811d702f3cadf5b06d7c1947846b10b90b19e79 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jun 2022 15:23:32 -0500 Subject: [PATCH 035/181] [Hexagon] Register strategy for concatenate (#11562) * [Hexagon] Register strategy for concatenate * Restart CI --- 
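As an illustration of what this enables (the shapes, variable names, and single-target build below are assumptions, and actually running it requires a Hexagon-enabled build of TVM), a concatenate expression can now be lowered for the Hexagon target:

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(1, 32), dtype="float32")
    y = relay.var("y", shape=(1, 32), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.concatenate((x, y), axis=1))

    # With the strategy registered, relay.build can lower concatenate for Hexagon.
    target = tvm.target.hexagon("v68")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=tvm.target.Target(target, host=target))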
python/tvm/relay/op/strategy/hexagon.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py index da15a5412517d..be01ee50fba82 100644 --- a/python/tvm/relay/op/strategy/hexagon.py +++ b/python/tvm/relay/op/strategy/hexagon.py @@ -26,7 +26,7 @@ @batch_matmul_strategy.register("hexagon") -def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): +def batch_matmul_strategy_hexagon(attrs, inputs, out_type, target): """batch_matmul strategy for Hexagon""" strategy = _op.OpStrategy() strategy.add_implementation( @@ -37,6 +37,18 @@ def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): return strategy +@concatenate_strategy.register("hexagon") +def concatenate_strategy_hexagon(attrs, inputs, out_type, target): + """concatenate strategy for Hexagon""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_concat(topi.concatenate), + wrap_topi_schedule(topi.hexagon.schedule_injective), + name="concatenate.hexagon", + ) + return strategy + + @conv2d_strategy.register("hexagon") def conv2d_strategy_hexagon(attrs, inputs, out_type, target): """Conv2d strategy for Hexagon""" From cee74c9f8f5563b1bed1956acccd6027d530d45e Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Sat, 4 Jun 2022 02:02:09 +0530 Subject: [PATCH 036/181] [CI] Update to LLVM 14.0.0 for ci_hexagon (#11539) --- docker/install/ubuntu_install_hexagon.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/install/ubuntu_install_hexagon.sh b/docker/install/ubuntu_install_hexagon.sh index 46d2a44cfaa52..e616c8a4977cc 100755 --- a/docker/install/ubuntu_install_hexagon.sh +++ b/docker/install/ubuntu_install_hexagon.sh @@ -21,9 +21,9 @@ set -o pipefail # Install LLVM/clang CLANG_LLVM_HOME=/opt/clang-llvm -CLANG_LLVM_VERSION=13.0.0 +CLANG_LLVM_VERSION=14.0.0 CLANG_LLVM_FILENAME=clang_llvm.tar.xz -wget -q https://github.com/llvm/llvm-project/releases/download/llvmorg-${CLANG_LLVM_VERSION}/clang+llvm-${CLANG_LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz -O ${CLANG_LLVM_FILENAME} +wget -q https://github.com/llvm/llvm-project/releases/download/llvmorg-${CLANG_LLVM_VERSION}/clang+llvm-${CLANG_LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz -O ${CLANG_LLVM_FILENAME} mkdir ${CLANG_LLVM_HOME} tar -xvf ${CLANG_LLVM_FILENAME} -C ${CLANG_LLVM_HOME} --strip-components=1 rm ${CLANG_LLVM_FILENAME} From b885362c36eff6d08363d53e5816f696a99ac822 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 3 Jun 2022 16:03:08 -0500 Subject: [PATCH 037/181] [CI] Refactor of tvm.testing.requires_* annotations (#11313) * [CI] Improved skip messages when using @tvm.testing.requires_* Previously, the same message was given regardless of why a test couldn't be run. This has been split up into separate checks for TVM cmake options in `config.cmake`, enabled targets in `TVM_TEST_TARGETS` environment variable, and checks for available hardware. * Refactor to specify repeated feature marks, compile-only markers * Fixed lint errors * Import from contrib, not from a different import * Removed use of requires_llvm() as a list of marks * Corrected mark from requires_gpu to requires_cuda * Adding missing "not" * Added USE_CMSISNN as a requirement for corstone300. 
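As an illustration of the resulting API (the test names below are hypothetical), each requires_* decorator still works as a bare mark, now also accepts a support level, and exposes its underlying marks as a list:

    @tvm.testing.requires_cuda
    def test_compile_and_run():
        ...

    @tvm.testing.requires_cuda(support_required="compile-only")
    def test_compile_only():
        ...

    # For parametrization, the marks can be retrieved explicitly:
    pytest.param(True, marks=tvm.testing.requires_cuda.marks())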
--- python/tvm/testing/plugin.py | 25 +- python/tvm/testing/utils.py | 799 ++++++++++++---------- tests/python/contrib/test_dnnl.py | 4 +- tests/python/contrib/test_tensorrt.py | 4 +- tests/python/driver/tvmc/test_compiler.py | 12 +- tests/python/integration/test_reduce.py | 2 +- 6 files changed, 463 insertions(+), 383 deletions(-) diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index e90bd5e6dbf52..1f4f983b72102 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -56,8 +56,8 @@ def pytest_configure(config): """Runs at pytest configure time, defines marks to be used later.""" - for markername, desc in MARKERS.items(): - config.addinivalue_line("markers", "{}: {}".format(markername, desc)) + for feature in utils.Feature._all_features.values(): + feature._register_marker(config) print("enabled targets:", "; ".join(map(lambda x: x[0], utils.enabled_targets()))) print("pytest marker:", config.option.markexpr) @@ -269,25 +269,26 @@ def _target_to_requirement(target): # mapping from target to decorator if target.kind.name == "cuda" and "cudnn" in target.attrs.get("libs", []): - return utils.requires_cudnn() + return utils.requires_cudnn.marks() if target.kind.name == "cuda" and "cublas" in target.attrs.get("libs", []): - return utils.requires_cublas() + return utils.requires_cublas.marks() if target.kind.name == "cuda": - return utils.requires_cuda() + return utils.requires_cuda.marks() if target.kind.name == "rocm": - return utils.requires_rocm() + return utils.requires_rocm.marks() if target.kind.name == "vulkan": - return utils.requires_vulkan() + return utils.requires_vulkan.marks() if target.kind.name == "nvptx": - return utils.requires_nvptx() + return utils.requires_nvptx.marks() if target.kind.name == "metal": - return utils.requires_metal() + return utils.requires_metal.marks() if target.kind.name == "opencl": - return utils.requires_opencl() + return utils.requires_opencl.marks() if target.kind.name == "llvm": - return utils.requires_llvm() + return utils.requires_llvm.marks() if target.kind.name == "hexagon": - return utils.requires_hexagon() + return utils.requires_hexagon.marks() + return [] diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 0e2d7be4a14e7..939786c9294fc 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -67,15 +67,20 @@ def test_something(): import copyreg import ctypes import functools +import itertools import logging import os +import pickle import platform import shutil import sys import time -import pickle + +from typing import Optional, Callable, Union, List + import pytest import numpy as np + import tvm import tvm.arith import tvm.tir @@ -84,9 +89,6 @@ def test_something(): from tvm.contrib import nvcc, cudnn from tvm.error import TVMError -from tvm.relay.op.contrib.ethosn import ethosn_available -from tvm.relay.op.contrib import cmsisnn -from tvm.relay.op.contrib import vitis_ai SKIP_SLOW_TESTS = os.getenv("SKIP_SLOW_TESTS", "").lower() in {"true", "1", "yes"} @@ -388,12 +390,9 @@ def _check_forward(constraints1, constraints2, varmap, backvarmap): ) -def _get_targets(target_str=None): - if target_str is None: - target_str = os.environ.get("TVM_TEST_TARGETS", "") - # Use dict instead of set for de-duplication so that the - # targets stay in the order specified. 
- target_names = list({t.strip(): None for t in target_str.split(";") if t.strip()}) +def _get_targets(target_names=None): + if target_names is None: + target_names = _tvm_test_targets() if not target_names: target_names = DEFAULT_TEST_TARGETS @@ -429,7 +428,7 @@ def _get_targets(target_str=None): " Try setting TVM_TEST_TARGETS to a supported target. Defaulting to llvm.", target_str, ) - return _get_targets("llvm") + return _get_targets(["llvm"]) raise TVMError( "None of the following targets are supported by this build of TVM: %s." @@ -515,458 +514,544 @@ def enabled_targets(): return [(t["target"], tvm.device(t["target"])) for t in _get_targets() if t["is_runnable"]] -def _compose(args, decs): - """Helper to apply multiple markers""" - if len(args) > 0: - f = args[0] - for d in reversed(decs): - f = d(f) - return f - return decs +class Feature: + """A feature that may be required to run a test. -def slow(fn): - @functools.wraps(fn) - def wrapper(*args, **kwargs): - if SKIP_SLOW_TESTS: - pytest.skip("Skipping slow test since RUN_SLOW_TESTS environment variables is 'true'") - else: - fn(*args, **kwargs) + Parameters + ---------- + name: str - return wrapper + The short name of the feature. Should match the name in the + requires_* decorator. This is applied as a mark to all tests + using this feature, and can be used in pytests ``-m`` + argument. + long_name: Optional[str] -def uses_gpu(*args): - """Mark to differentiate tests that use the GPU in some capacity. + The long name of the feature, to be used in error messages. - These tests will be run on CPU-only test nodes and on test nodes with GPUs. - To mark a test that must have a GPU present to run, use - :py:func:`tvm.testing.requires_gpu`. + If None, defaults to the short name. - Parameters - ---------- - f : function - Function to mark - """ - _uses_gpu = [pytest.mark.gpu] - return _compose(args, _uses_gpu) + cmake_flag: Optional[str] + The flag that must be enabled in the config.cmake in order to + use this feature. -def requires_x86(*args): - """Mark a test as requiring the x86 Architecture to run. + If None, no flag is required to use this feature. - Tests with this mark will not be run unless on an x86 platform. + target_kind_enabled: Optional[str] - Parameters - ---------- - f : function - Function to mark - """ - _requires_x86 = [ - pytest.mark.skipif(platform.machine() != "x86_64", reason="x86 Architecture Required"), - ] - return _compose(args, _requires_x86) + The target kind that must be enabled to run tests using this + feature. If present, the target_kind must appear in the + TVM_TEST_TARGETS environment variable, or in + tvm.testing.DEFAULT_TEST_TARGETS if TVM_TEST_TARGETS is + undefined. + If None, this feature does not require a specific target to be + enabled. -def requires_gpu(*args): - """Mark a test as requiring a GPU to run. + compile_time_check: Optional[Callable[[], Union[bool,str]]] - Tests with this mark will not be run unless a gpu is present. + A check that returns True if the feature can be used at + compile-time. (e.g. Validating the version number of the nvcc + compiler.) If the feature does not have support to perform + compile-time tests, the check should returns False to display + a generic error message, or a string to display a more + specific error message. 
- Parameters - ---------- - f : function - Function to mark - """ - _requires_gpu = [ - pytest.mark.skipif( - not tvm.cuda().exist - and not tvm.rocm().exist - and not tvm.opencl().exist - and not tvm.metal().exist - and not tvm.vulkan().exist, - reason="No GPU present", - ), - *uses_gpu(), - ] - return _compose(args, _requires_gpu) + If None, no additional check is performed. + target_kind_hardware: Optional[str] -def requires_cuda(*args): - """Mark a test as requiring the CUDA runtime. + The target kind that must have available hardware in order to + run tests using this feature. This is checked using + tvm.device(target_kind_hardware).exist. If a feature requires + a different check, this should be implemented using + run_time_check. - This also marks the test as requiring a cuda gpu. + If None, this feature does not require a specific + tvm.device to exist. - Parameters - ---------- - f : function - Function to mark - """ - _requires_cuda = [ - pytest.mark.cuda, - pytest.mark.skipif(not device_enabled("cuda"), reason="CUDA support not enabled"), - *requires_gpu(), - ] - return _compose(args, _requires_cuda) + run_time_check: Optional[Callable[[], Union[bool,str]]] + A check that returns True if the feature can be used at + run-time. (e.g. Validating the compute version supported by a + GPU.) If the feature does not have support to perform + run-time tests, the check should returns False to display a + generic error message, or a string to display a more specific + error message. -def requires_cudnn(*args): - """Mark a test as requiring the cuDNN library. + If None, no additional check is performed. - This also marks the test as requiring a cuda gpu. + parent_features: Optional[Union[str,List[str]]] - Parameters - ---------- - f : function - Function to mark - """ + The short name of a feature or features that are required in + order to use this feature. (e.g. Using cuDNN requires using + CUDA) This feature should inherit all checks of the parent + feature, with the exception of the `target_kind_enabled` + checks. - requirements = [ - pytest.mark.skipif( - not cudnn.exists(), reason="cuDNN library not enabled, or not installed" - ), - *requires_cuda(), - ] - return _compose(args, requirements) + If None, this feature does not require any other parent + features. + """ -def requires_cublas(*args): - """Mark a test as requiring the cuBLAS library. + _all_features = {} + + def __init__( + self, + name: str, + long_name: Optional[str] = None, + cmake_flag: Optional[str] = None, + target_kind_enabled: Optional[str] = None, + compile_time_check: Optional[Callable[[], Union[bool, str]]] = None, + target_kind_hardware: Optional[str] = None, + run_time_check: Optional[Callable[[], Union[bool, str]]] = None, + parent_features: Optional[Union[str, List[str]]] = None, + ): + self.name = name + self.long_name = long_name or name + self.cmake_flag = cmake_flag + self.target_kind_enabled = target_kind_enabled + self.compile_time_check = compile_time_check + self.target_kind_hardware = target_kind_hardware + self.run_time_check = run_time_check + + if parent_features is None: + self.parent_features = [] + elif isinstance(parent_features, str): + self.parent_features = [parent_features] + else: + self.parent_features = parent_features - This also marks the test as requiring a cuda gpu. 
+ self._all_features[self.name] = self - Parameters - ---------- - f : function - Function to mark - """ + def _register_marker(self, config): + config.addinivalue_line("markers", f"{self.name}: Mark a test as using {self.long_name}") - requirements = [ - pytest.mark.skipif( - tvm.get_global_func("tvm.contrib.cublas.matmul", True), - reason="cuDNN library not enabled", - ), - *requires_cuda(), - ] - return _compose(args, requirements) + def _uses_marks(self): + for parent in self.parent_features: + yield from self._all_features[parent]._uses_marks() + yield getattr(pytest.mark, self.name) -def requires_nvptx(*args): - """Mark a test as requiring the NVPTX compilation on the CUDA runtime + def _compile_only_marks(self): + for parent in self.parent_features: + yield from self._all_features[parent]._compile_only_marks() - This also marks the test as requiring a cuda gpu, and requiring - LLVM support. + if self.compile_time_check is not None: + res = self.compile_time_check() + if isinstance(res, str): + yield pytest.mark.skipif(True, reason=res) + else: + yield pytest.mark.skipif( + not res, reason=f"Compile-time support for {self.long_name} not present" + ) - Parameters - ---------- - f : function - Function to mark + if self.target_kind_enabled is not None: + target_kind = self.target_kind_enabled.split()[0] + yield pytest.mark.skipif( + all(enabled.split()[0] != target_kind for enabled in _tvm_test_targets()), + reason=( + f"{self.target_kind_enabled} tests disabled " + f"by TVM_TEST_TARGETS environment variable" + ), + ) - """ - _requires_nvptx = [ - pytest.mark.skipif(not device_enabled("nvptx"), reason="NVPTX support not enabled"), - *requires_llvm(), - *requires_gpu(), - ] - return _compose(args, _requires_nvptx) + if self.cmake_flag is not None: + yield pytest.mark.skipif( + not _cmake_flag_enabled(self.cmake_flag), + reason=( + f"{self.long_name} support not enabled. " + f"Set {self.cmake_flag} in config.cmake to enable." + ), + ) + def _run_only_marks(self): + for parent in self.parent_features: + yield from self._all_features[parent]._run_only_marks() + + if self.run_time_check is not None: + res = self.run_time_check() + if isinstance(res, str): + yield pytest.mark.skipif(True, reason=res) + else: + yield pytest.mark.skipif( + not res, reason=f"Run-time support for {self.long_name} not present" + ) -def requires_nvcc_version(major_version, minor_version=0, release_version=0): - """Mark a test as requiring at least a specific version of nvcc. + if self.target_kind_hardware is not None: + yield pytest.mark.skipif( + not tvm.device(self.target_kind_hardware).exist, + reason=f"No device exists for target {self.target_kind_hardware}", + ) - Unit test marked with this decorator will run only if the - installed version of NVCC is at least `(major_version, - minor_version, release_version)`. + def marks(self, support_required="compile-and-run"): + """Return a list of marks to be used - This also marks the test as requiring a cuda support. + Parameters + ---------- - Parameters - ---------- - major_version: int + support_required: str - The major version of the (major,minor,release) version tuple. + Allowed values: "compile-and-run" (default), + "compile-only", or "optional". - minor_version: int + See Feature.__call__ for details. + """ + if support_required not in ["compile-and-run", "compile-only", "optional"]: + raise ValueError(f"Unknown feature support type: {support_required}") - The minor version of the (major,minor,release) version tuple. 
+ if support_required == "compile-and-run": + marks = itertools.chain( + self._run_only_marks(), self._compile_only_marks(), self._uses_marks() + ) + elif support_required == "compile-only": + marks = itertools.chain(self._compile_only_marks(), self._uses_marks()) + elif support_required == "optional": + marks = self._uses_marks() + else: + raise ValueError(f"Unknown feature support type: {support_required}") - release_version: int + return list(marks) - The release version of the (major,minor,release) version tuple. + def __call__(self, func=None, *, support_required="compile-and-run"): + """Mark a pytest function as requiring this feature - """ + Can be used either as a bare decorator, or as a decorator with + arguments. - try: - nvcc_version = nvcc.get_cuda_version() - except RuntimeError: - nvcc_version = (0, 0, 0) + Parameters + ---------- - min_version = (major_version, minor_version, release_version) - version_str = ".".join(str(v) for v in min_version) - requires = [ - pytest.mark.skipif(nvcc_version < min_version, reason=f"Requires NVCC >= {version_str}"), - *requires_cuda(), - ] + func: Callable - def inner(func): - return _compose([func], requires) + The pytest test function to be marked - return inner + support_required: str + Allowed values: "compile-and-run" (default), + "compile-only", or "optional". -def skip_if_32bit(reason): - def decorator(*args): - if "32bit" in platform.architecture()[0]: - return _compose(args, [pytest.mark.skip(reason=reason)]) + If "compile-and-run", the test case is marked as using the + feature, and is skipped if the environment lacks either + compile-time or run-time support for the feature. - return _compose(args, []) + If "compile-only", the test case is marked as using the + feature, and is skipped if the environment lacks + compile-time support. - return decorator + If "optional", the test case is marked as using the + feature, but isn't skipped. This is kept for backwards + compatibility for tests that use `enabled_targets()`, and + should be avoided in new test code. Instead, prefer + parametrizing over the target using the `target` fixture. + Examples + -------- -def requires_cudagraph(*args): - """Mark a test as requiring the CUDA Graph Feature + .. code-block:: python - This also marks the test as requiring cuda + @feature + def test_compile_and_run(): + ... - Parameters - ---------- - f : function - Function to mark - """ - _requires_cudagraph = [ - pytest.mark.skipif( - not nvcc.have_cudagraph(), reason="CUDA Graph is not supported in this environment" - ), - *requires_cuda(), - ] - return _compose(args, _requires_cudagraph) + @feature(compile_only=True) + def test_compile_only(): + ... + """ -def requires_opencl(*args): - """Mark a test as requiring the OpenCL runtime. + if support_required not in ["compile-and-run", "compile-only", "optional"]: + raise ValueError(f"Unknown feature support type: {support_required}") - This also marks the test as requiring a gpu. 
+ def wrapper(func): + for mark in self.marks(support_required=support_required): + func = mark(func) + return func - Parameters - ---------- - f : function - Function to mark - """ - _requires_opencl = [ - pytest.mark.opencl, - pytest.mark.skipif(not device_enabled("opencl"), reason="OpenCL support not enabled"), - *requires_gpu(), - ] - return _compose(args, _requires_opencl) + if func is None: + return wrapper + return wrapper(func) -def requires_corstone300(*args): - """Mark a test as requiring the corstone300 FVP + @classmethod + def require(cls, name, support_required="compile-and-run"): + """Returns a decorator that marks a test as requiring a feature - Parameters - ---------- - f : function - Function to mark - """ - _requires_corstone300 = [ - pytest.mark.corstone300, - pytest.mark.skipif( - shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable" - ), - ] - return _compose(args, _requires_corstone300) + Parameters + ---------- + name: str -def requires_rocm(*args): - """Mark a test as requiring the rocm runtime. + The name of the feature that is used by the test - This also marks the test as requiring a gpu. + support_required: str - Parameters - ---------- - f : function - Function to mark - """ - _requires_rocm = [ - pytest.mark.rocm, - pytest.mark.skipif(not device_enabled("rocm"), reason="rocm support not enabled"), - *requires_gpu(), - ] - return _compose(args, _requires_rocm) + Allowed values: "compile-and-run" (default), + "compile-only", or "optional". + See Feature.__call__ for details. -def requires_metal(*args): - """Mark a test as requiring the metal runtime. + Examples + -------- - This also marks the test as requiring a gpu. + .. code-block:: python - Parameters - ---------- - f : function - Function to mark - """ - _requires_metal = [ - pytest.mark.metal, - pytest.mark.skipif(not device_enabled("metal"), reason="metal support not enabled"), - *requires_gpu(), - ] - return _compose(args, _requires_metal) + @Feature.require("cuda") + def test_compile_and_run(): + ... + @Feature.require("cuda", compile_only=True) + def test_compile_only(): + ... + """ + return cls._all_features[name](support_required=support_required) -def requires_vulkan(*args): - """Mark a test as requiring the vulkan runtime. - This also marks the test as requiring a gpu. +def _any_gpu_exists(): + return ( + tvm.cuda().exist + or tvm.rocm().exist + or tvm.opencl().exist + or tvm.metal().exist + or tvm.vulkan().exist + ) - Parameters - ---------- - f : function - Function to mark - """ - _requires_vulkan = [ - pytest.mark.vulkan, - pytest.mark.skipif(not device_enabled("vulkan"), reason="vulkan support not enabled"), - *requires_gpu(), - ] - return _compose(args, _requires_vulkan) +# Mark a test as requiring llvm to run +requires_llvm = Feature( + "llvm", "LLVM", cmake_flag="USE_LLVM", target_kind_enabled="llvm", target_kind_hardware="llvm" +) -def requires_tensorcore(*args): - """Mark a test as requiring a tensorcore to run. +# Mark a test as requiring a GPU to run. +requires_gpu = Feature("gpu", run_time_check=_any_gpu_exists) - Tests with this mark will not be run unless a tensorcore is present. +# Mark to differentiate tests that use the GPU in some capacity. +# +# These tests will be run on CPU-only test nodes and on test nodes with GPUs. +# To mark a test that must have a GPU present to run, use +# :py:func:`tvm.testing.requires_gpu`. +uses_gpu = requires_gpu(support_required="optional") + +# Mark a test as requiring the x86 Architecture to run. 
+requires_x86 = Feature( + "x86", "x86 Architecture", run_time_check=lambda: platform.machine() == "x86_64" +) + +# Mark a test as requiring the CUDA runtime. +requires_cuda = Feature( + "cuda", + "CUDA", + cmake_flag="USE_CUDA", + target_kind_enabled="cuda", + target_kind_hardware="cuda", + parent_features="gpu", +) + +# Mark a test as requiring a tensorcore to run +requires_tensorcore = Feature( + "tensorcore", + "NVIDIA Tensor Core", + run_time_check=lambda: tvm.cuda().exist and nvcc.have_tensorcore(tvm.cuda().compute_version), + parent_features="cuda", +) + +# Mark a test as requiring the cuDNN library. +requires_cudnn = Feature("cudnn", "cuDNN", cmake_flag="USE_CUDNN", parent_features="cuda") + +# Mark a test as requiring the cuBLAS library. +requires_cublas = Feature("cublas", "cuBLAS", cmake_flag="USE_CUBLAS", parent_features="cuda") + +# Mark a test as requiring the NVPTX compilation on the CUDA runtime +requires_nvptx = Feature( + "nvptx", + "NVPTX", + target_kind_enabled="nvptx", + target_kind_hardware="nvptx", + parent_features=["llvm", "cuda"], +) + +# Mark a test as requiring the CUDA Graph Feature +requires_cudagraph = Feature( + "cudagraph", + "CUDA Graph", + target_kind_enabled="cuda", + compile_time_check=nvcc.have_cudagraph, + parent_features="cuda", +) + +# Mark a test as requiring the OpenCL runtime +requires_opencl = Feature( + "opencl", + "OpenCL", + cmake_flag="USE_OPENCL", + target_kind_enabled="opencl", + target_kind_hardware="opencl", + parent_features="gpu", +) + +# Mark a test as requiring the rocm runtime +requires_rocm = Feature( + "rocm", + "ROCm", + cmake_flag="USE_ROCM", + target_kind_enabled="rocm", + target_kind_hardware="rocm", + parent_features="gpu", +) + +# Mark a test as requiring the metal runtime +requires_metal = Feature( + "metal", + "Metal", + cmake_flag="USE_METAL", + target_kind_enabled="metal", + target_kind_hardware="metal", + parent_features="gpu", +) + +# Mark a test as requiring the vulkan runtime +requires_vulkan = Feature( + "vulkan", + "Vulkan", + cmake_flag="USE_VULKAN", + target_kind_enabled="vulkan", + target_kind_hardware="vulkan", + parent_features="gpu", +) + +# Mark a test as requiring microTVM to run +requires_micro = Feature("micro", "MicroTVM", cmake_flag="USE_MICRO") + +# Mark a test as requiring rpc to run +requires_rpc = Feature("rpc", "RPC", cmake_flag="USE_RPC") + +# Mark a test as requiring Arm(R) Ethos(TM)-N to run +requires_ethosn = Feature("ethosn", "Arm(R) Ethos(TM)-N", cmake_flag="USE_ETHOSN") + +# Mark a test as requiring Hexagon to run +requires_hexagon = Feature( + "hexagon", + "Hexagon", + cmake_flag="USE_HEXAGON", + target_kind_enabled="hexagon", + compile_time_check=lambda: ( + (_cmake_flag_enabled("USE_LLVM") and tvm.target.codegen.llvm_version_major() >= 7) + or "Hexagon requires LLVM 7 or later" + ), + target_kind_hardware="hexagon", + parent_features="llvm", +) + +# Mark a test as requiring the CMSIS NN library +requires_cmsisnn = Feature("cmsisnn", "CMSIS NN", cmake_flag="USE_CMSISNN") + +# Mark a test as requiring the corstone300 FVP +requires_corstone300 = Feature( + "corstone300", + "Corstone-300", + compile_time_check=lambda: ( + (shutil.which("arm-none-eabi-gcc") is None) or "ARM embedded toolchain unavailable" + ), + parent_features="cmsisnn", +) + +# Mark a test as requiring Vitis AI to run +requires_vitis_ai = Feature("vitis_ai", "Vitis AI", cmake_flag="USE_VITIS_AI") + + +def _cmake_flag_enabled(flag): + flag = tvm.support.libinfo()[flag] + + # Because many of the flags can be library flags, we 
check if the + # flag is not disabled, rather than checking if it is enabled. + return flag.lower() not in ["off", "false", "0"] + + +def _tvm_test_targets(): + target_str = os.environ.get("TVM_TEST_TARGETS", "").strip() + if target_str: + # Use dict instead of set for de-duplication so that the + # targets stay in the order specified. + return list({t.strip(): None for t in target_str.split(";") if t.strip()}) - Parameters - ---------- - f : function - Function to mark - """ - _requires_tensorcore = [ - pytest.mark.tensorcore, - pytest.mark.skipif( - not tvm.cuda().exist or not nvcc.have_tensorcore(tvm.cuda(0).compute_version), - reason="No tensorcore present", - ), - *requires_gpu(), - ] - return _compose(args, _requires_tensorcore) + return DEFAULT_TEST_TARGETS -def requires_llvm(*args): - """Mark a test as requiring llvm to run. +def _compose(args, decs): + """Helper to apply multiple markers""" + if len(args) > 0: + f = args[0] + for d in reversed(decs): + f = d(f) + return f + return decs - Parameters - ---------- - f : function - Function to mark - """ - _requires_llvm = [ - pytest.mark.llvm, - pytest.mark.skipif(not device_enabled("llvm"), reason="LLVM support not enabled"), - ] - return _compose(args, _requires_llvm) +def slow(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + if SKIP_SLOW_TESTS: + pytest.skip("Skipping slow test since RUN_SLOW_TESTS environment variables is 'true'") + else: + fn(*args, **kwargs) -def requires_micro(*args): - """Mark a test as requiring microTVM to run. + return wrapper - Parameters - ---------- - f : function - Function to mark - """ - _requires_micro = [ - pytest.mark.skipif( - tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON", - reason="MicroTVM support not enabled. Set USE_MICRO=ON in config.cmake to enable.", - ) - ] - return _compose(args, _requires_micro) +def requires_nvcc_version(major_version, minor_version=0, release_version=0): + """Mark a test as requiring at least a specific version of nvcc. -def requires_rpc(*args): - """Mark a test as requiring rpc to run. + Unit test marked with this decorator will run only if the + installed version of NVCC is at least `(major_version, + minor_version, release_version)`. + + This also marks the test as requiring a cuda support. Parameters ---------- - f : function - Function to mark - """ - _requires_rpc = [ - pytest.mark.skipif( - tvm.support.libinfo().get("USE_RPC", "OFF") != "ON", - reason="RPC support not enabled. Set USE_RPC=ON in config.cmake to enable.", - ) - ] - return _compose(args, _requires_rpc) + major_version: int + The major version of the (major,minor,release) version tuple. -def requires_ethosn(*args): - """Mark a test as requiring Arm(R) Ethos(TM)-N to run. + minor_version: int - Parameters - ---------- - f : function - Function to mark - """ - marks = [ - pytest.mark.ethosn, - pytest.mark.skipif( - not ethosn_available(), - reason=( - "Arm(R) Ethos(TM)-N support not enabled. " - "Set USE_ETHOSN=ON in config.cmake to enable, " - "and ensure that hardware support is present." - ), - ), - ] - return _compose(args, marks) + The minor version of the (major,minor,release) version tuple. + release_version: int -def requires_hexagon(*args): - """Mark a test as requiring Hexagon to run. + The release version of the (major,minor,release) version tuple. 
- Parameters - ---------- - f : function - Function to mark """ - _requires_hexagon = [ - pytest.mark.hexagon, - pytest.mark.skipif(not device_enabled("hexagon"), reason="Hexagon support not enabled"), - *requires_llvm(), - pytest.mark.skipif( - tvm.target.codegen.llvm_version_major() < 7, reason="Hexagon requires LLVM 7 or later" - ), - ] - return _compose(args, _requires_hexagon) + try: + nvcc_version = nvcc.get_cuda_version() + except RuntimeError: + nvcc_version = (0, 0, 0) -def requires_cmsisnn(*args): - """Mark a test as requiring the CMSIS NN library. + min_version = (major_version, minor_version, release_version) + version_str = ".".join(str(v) for v in min_version) + requires = [ + pytest.mark.skipif(nvcc_version < min_version, reason=f"Requires NVCC >= {version_str}"), + *requires_cuda.marks(), + ] - Parameters - ---------- - f : function - Function to mark - """ + def inner(func): + return _compose([func], requires) - requirements = [pytest.mark.skipif(not cmsisnn.enabled(), reason="CMSIS NN not enabled")] - return _compose(args, requirements) + return inner -def requires_vitis_ai(*args): - """Mark a test as requiring Vitis AI to run. +def skip_if_32bit(reason): + def decorator(*args): + if "32bit" in platform.architecture()[0]: + return _compose(args, [pytest.mark.skip(reason=reason)]) - Parameters - ---------- - f : function - Function to mark - """ + return _compose(args, []) - requirements = [pytest.mark.skipif(not vitis_ai.enabled(), reason="Vitis AI not enabled")] - return _compose(args, requirements) + return decorator def requires_package(*packages): diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index 76e3f1c3a4055..19ac183d66dfe 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -34,8 +34,8 @@ ) run_module = tvm.testing.parameter( - pytest.param(False, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm()]), - pytest.param(True, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm()]), + pytest.param(False, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), + pytest.param(True, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), ids=["compile", "run"], ) diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 982ec976d54ed..cecb64785a49a 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -44,9 +44,9 @@ ) run_module = tvm.testing.parameter( - pytest.param(False, marks=[has_tensorrt_codegen, *tvm.testing.requires_cuda()]), + pytest.param(False, marks=[has_tensorrt_codegen, *tvm.testing.requires_cuda.marks()]), pytest.param( - True, marks=[has_tensorrt_runtime, has_tensorrt_codegen, *tvm.testing.requires_cuda()] + True, marks=[has_tensorrt_runtime, has_tensorrt_codegen, *tvm.testing.requires_cuda.marks()] ), ids=["compile", "run"], ) diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index d6ae27957de2d..e8e93a6c7514d 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -25,7 +25,7 @@ import tvm import tvm.testing -from tvm.testing.utils import ethosn_available +from tvm.relay.op.contrib.ethosn import ethosn_available from tvm.relay.backend import Runtime, Executor from tvm.contrib.target.vitis_ai import vitis_ai_available @@ -412,10 +412,7 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn( assert len(c_source_files) == 4 -@pytest.mark.skipif( - not ethosn_available(), - 
reason="--target=Ethos(TM)-N78 is not available. TVM built with 'USE_ETHOSN OFF'", -) +@tvm.testing.requires_ethosn def test_compile_tflite_module_with_external_codegen_ethos_n78(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant) @@ -430,10 +427,7 @@ def test_compile_tflite_module_with_external_codegen_ethos_n78(tflite_mobilenet_ assert os.path.exists(dumps_path) -@pytest.mark.skipif( - not vitis_ai_available(), - reason="--target=vitis-ai is not available. TVM built with 'USE_VITIS_AI OFF'", -) +@tvm.testing.requires_vitis_ai def test_compile_tflite_module_with_external_codegen_vitis_ai(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index a40164ded941e..f3886374ccb65 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -528,7 +528,7 @@ def check_target(device): check_target("rocm") -@tvm.testing.requires_gpu +@tvm.testing.requires_cuda def test_reduce_storage_reuse(): target = tvm.target.Target("cuda") From 8823757f3037cdf2afe0ce6bb4f38fff8ef97536 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jun 2022 16:09:24 -0500 Subject: [PATCH 038/181] [TIR] Expose tir.call_cpacked in python (#11563) --- python/tvm/tir/__init__.py | 2 +- python/tvm/tir/op.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 2d201bb0dab65..6db93b6ad0915 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -44,7 +44,7 @@ from .function import PrimFunc, TensorIntrin, IndexMap -from .op import call_packed, call_intrin, call_pure_extern, call_extern +from .op import call_packed, call_cpacked, call_intrin, call_pure_extern, call_extern from .op import call_llvm_intrin, call_llvm_pure_intrin, ret, all, any, min_value, max_value, trace from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz from .op import sin, sinh, asin, asinh diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index de3ca5fa8d5b2..5d15bf15da581 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -73,6 +73,33 @@ def call_packed(*args, span=None): return Call("int32", Op.get("tir.tvm_call_packed"), call_args, span) +def call_cpacked(*args, span=None): + """Build expression by call an external packed function. + + Same as call_packed, except that the first argument is the function name + (as in call_extern), and the last argument is the resource handle. + + Parameters + ---------- + args : list of Expr or Buffer. + Positional arguments. + + span : Optional[Span] + The location of this operator in the source code. + + Returns + ------- + call : PrimExpr + The call expression. + + See Also + -------- + te.extern : Create tensor with extern function call. + """ + call_args = [_pack_buffer(x) if isinstance(x, Buffer) else x for x in args] + return Call("int32", Op.get("tir.tvm_call_cpacked"), call_args, span) + + def call_intrin(dtype, func_name, *args, span=None): """Build expression by calling an intrinsic function. 
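For reference, a small sketch of how the newly exposed helper can be used; the callee name, buffers, and handle variable here are placeholders rather than anything from this patch:

    import tvm
    from tvm import tir

    a = tir.decl_buffer((16,), "float32", name="a")
    b = tir.decl_buffer((16,), "float32", name="b")
    resource_handle = tir.Var("rh", "handle")

    # First argument is the callee name, last argument is the resource handle;
    # Buffer arguments are packed (via _pack_buffer) just as in call_packed.
    call = tir.call_cpacked("my_packed_fn", a, b, resource_handle)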
From 6dbdf2e20116ecc6f5379f5cb430ed023ff0d62b Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 3 Jun 2022 14:22:05 -0700 Subject: [PATCH 039/181] Fix Hexagon build using ci.py (#11304) * Add output directory add post build for hexagon fix -net=host for docker * remove --net by default --- tests/scripts/ci.py | 6 +++++- tests/scripts/task_build_hexagon_api.sh | 16 ++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index b3f9cb6500e53..599bbaddceec9 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -342,6 +342,7 @@ def generate_command( options: Dict[str, Option], help: str, precheck: Optional[Callable[[], None]] = None, + post_build: Optional[List[str]] = None, ): """ Helper to generate CLIs that: @@ -378,6 +379,9 @@ def fn( f"./tests/scripts/task_build.py --build-dir {get_build_dir(name)}", ] + if post_build is not None: + scripts += post_build + # Check that a test suite was not used alongside specific test names if any(v for v in kwargs.values()) and tests is not None: option_flags = ", ".join([f"--{k}" for k in options.keys()]) @@ -624,12 +628,12 @@ def add_subparser( generate_command( name="hexagon", help="Run Hexagon build and test(s)", + post_build=["./tests/scripts/task_build_hexagon_api.sh --output build-hexagon"], options={ "cpp": CPP_UNITTEST, "test": ( "run Hexagon API/Python tests", [ - "./tests/scripts/task_build_hexagon_api.sh", "./tests/scripts/task_python_hexagon.sh", ], ), diff --git a/tests/scripts/task_build_hexagon_api.sh b/tests/scripts/task_build_hexagon_api.sh index 4c7b4f396ced4..5f811e4e27492 100755 --- a/tests/scripts/task_build_hexagon_api.sh +++ b/tests/scripts/task_build_hexagon_api.sh @@ -19,6 +19,15 @@ set -e set -u +output_directory_parent=$(realpath ${PWD}/build) +if [ $# -ge 1 ] && [[ "$1" == "--output" ]]; then + shift 1 + output_directory_parent=$(realpath $1) + shift 1 +fi +output_directory="${output_directory_parent}/hexagon_api_output" +rm -rf ${output_directory} + use_cache=false if [ $# -ge 1 ] && [[ "$1" == "--use-cache" ]]; then use_cache=true @@ -26,24 +35,19 @@ if [ $# -ge 1 ] && [[ "$1" == "--use-cache" ]]; then fi cd apps/hexagon_api - if [ "$use_cache" = false ]; then rm -rf build fi - mkdir -p build cd build -output_binary_directory=$(realpath ${PWD}/../../../build/hexagon_api_output) -rm -rf ${output_binary_directory} - cmake -DANDROID_ABI=arm64-v8a \ -DANDROID_PLATFORM=android-28 \ -DUSE_ANDROID_TOOLCHAIN="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \ -DUSE_HEXAGON_ARCH=v68 \ -DUSE_HEXAGON_SDK="${HEXAGON_SDK_ROOT}" \ -DUSE_HEXAGON_TOOLCHAIN="${HEXAGON_TOOLCHAIN}" \ - -DUSE_OUTPUT_BINARY_DIR="${output_binary_directory}" \ + -DUSE_OUTPUT_BINARY_DIR="${output_directory}" \ -DUSE_HEXAGON_GTEST="${HEXAGON_SDK_ROOT}/utils/googletest/gtest" .. 
make -j$(nproc) From f05ebde8e84e4bce620b0fdf839b89eb60c1008c Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Fri, 3 Jun 2022 16:23:46 -0600 Subject: [PATCH 040/181] [docs] microTVM model training tutorial with Colab support (#10921) * First draft of micro train tutorial * unit test code * Fix obvious formatting issues * Linting * Proof of concept showing that "Open in Colab" is possible * Make test Python script more readable * Fix formatting * Ready for review * Import pyserial only when needed Changes from code review Use official sphinx-gallery repo Correctly specify version Import pyserial only when necessary * Add warning to ignored list Try to avoid throwing warning Fix linting, try verbosity filter Try adding to ignore file Remove fix attempts * Grammar fixes * Address code review comments Include full git hashes * Rerun tests * Rerun again --- .../template_project/microtvm_api_server.py | 4 +- apps/microtvm/pyproject.toml | 2 +- docker/install/ubuntu_install_sphinx.sh | 2 +- docs/conf.py | 3 +- .../how_to/work_with_microtvm/micro_train.py | 649 ++++++++++++++++++ tests/scripts/ci.py | 3 +- tests/scripts/task_python_docs.sh | 2 + 7 files changed, 660 insertions(+), 5 deletions(-) create mode 100644 gallery/how_to/work_with_microtvm/micro_train.py diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py index 95f941fe34737..131f92a208298 100644 --- a/apps/microtvm/arduino/template_project/microtvm_api_server.py +++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py @@ -34,7 +34,6 @@ import re from packaging import version -import serial.tools.list_ports from tvm.micro.project_api import server @@ -485,6 +484,9 @@ def flash(self, options): subprocess.run(upload_cmd, check=True) def open_transport(self, options): + import serial + import serial.tools.list_ports + # Zephyr example doesn't throw an error in this case if self._serial is not None: return diff --git a/apps/microtvm/pyproject.toml b/apps/microtvm/pyproject.toml index 98c769be48f51..5976328592290 100644 --- a/apps/microtvm/pyproject.toml +++ b/apps/microtvm/pyproject.toml @@ -129,7 +129,7 @@ importer-tflite = ["tflite", "tensorflow", "tensorflow-estimator"] autodocsumm = "^0.1" black = "^19.10b0" sphinx = "^3.0" -sphinx-gallery = "^0.8" +sphinx-gallery = { git = "https://github.com/sphinx-gallery/sphinx-gallery.git", rev = "6142f179" } sphinx-rtd-theme = "^0.4" matplotlib = "^3.2" Image = "^1.5" diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh index 12ca25b22b85a..96023fa6e633a 100755 --- a/docker/install/ubuntu_install_sphinx.sh +++ b/docker/install/ubuntu_install_sphinx.sh @@ -29,5 +29,5 @@ pip3 install \ matplotlib \ sphinx==4.2.0 \ sphinx_autodoc_annotation \ - sphinx-gallery==0.4.0 \ + "git+https://github.com/sphinx-gallery/sphinx-gallery.git@6142f1791151849b5bec4bf3959f75697ba226cd" \ sphinx_rtd_theme diff --git a/docs/conf.py b/docs/conf.py index 49c5c4fa755d2..9d55e20c03e5c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -312,6 +312,7 @@ def git_describe_version(original_version): "bring_your_own_datatypes.py", ], "micro": [ + "micro_train.py", "micro_autotune.py", "micro_reference_vm.py", "micro_tflite.py", @@ -360,11 +361,11 @@ def force_gc(gallery_conf, fname): "gallery_dirs": gallery_dirs, "subsection_order": subsection_order, "filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"), - "find_mayavi_figures": False, "download_all_examples": False, 
"min_reported_time": 60, "expected_failing_examples": [], "reset_modules": ("matplotlib", "seaborn", force_gc), + "promote_jupyter_magic": True, } autodoc_default_options = { diff --git a/gallery/how_to/work_with_microtvm/micro_train.py b/gallery/how_to/work_with_microtvm/micro_train.py new file mode 100644 index 0000000000000..378fe56d9da01 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/micro_train.py @@ -0,0 +1,649 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _microtvm-train-arduino: + +Training Vision Models for microTVM on Arduino +============================================== +**Author**: `Gavin Uberti `_ + +This tutorial shows how MobileNetV1 models can be trained +to fit on embedded devices, and how those models can be +deployed to Arduino using TVM. +""" + +###################################################################### +# .. note:: +# +# This tutorial is best viewed as a Jupyter Notebook. You can download and run it locally +# using the link at the bottom of this page, or open it online for free using Google Colab. +# Click the icon below to open in Google Colab. +# +# .. image:: https://raw.githubusercontent.com/guberti/web-data/micro-train-tutorial-data/images/utilities/colab_button.png +# :align: center +# :target: https://colab.research.google.com/github/guberti/tvm-site/blob/asf-site/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb +# :width: 300px +# +# Motivation +# ---------- +# When building IOT devices, we often want them to **see and understand** the world around them. +# This can take many forms, but often times a device will want to know if a certain **kind of +# object** is in its field of vision. +# +# For example, a security camera might look for **people**, so it can decide whether to save a video +# to memory. A traffic light might look for **cars**, so it can judge which lights should change +# first. Or a forest camera might look for a **kind of animal**, so they can estimate how large +# the animal population is. +# +# To make these devices affordable, we would like them to need only a low-cost processor like the +# `nRF52840 `_ (costing five dollars each on Mouser) or the `RP2040 `_ (just $1.45 each!). +# +# These devices have very little memory (~250 KB RAM), meaning that no conventional edge AI +# vision model (like MobileNet or EfficientNet) will be able to run. In this tutorial, we will +# show how these models can be modified to work around this requirement. Then, we will use TVM +# to compile and deploy it for an Arduino that uses one of these processors. +# +# Installing the Prerequisites +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# This tutorial will use TensorFlow to train the model - a widely used machine learning library +# created by Google. 
TensorFlow is a very low-level library, however, so we will the Keras +# interface to talk to TensorFlow. We will also use TensorFlow Lite to perform quantization on +# our model, as TensorFlow by itself does not support this. +# +# Once we have our generated model, we will use TVM to compile and test it. To avoid having to +# build from source, we'll install ``tlcpack`` - a community build of TVM. Lastly, we'll also +# install ``imagemagick`` and ``curl`` to preprocess data: +# +# .. code-block:: bash +# +# %%bash +# pip install -q tensorflow tflite +# pip install -q tlcpack-nightly -f https://tlcpack.ai/wheels +# apt-get -qq install imagemagick curl +# +# # Install Arduino CLI and library for Nano 33 BLE +# curl -fsSL https://raw.githubusercontent.com/arduino/arduino-cli/master/install.sh | sh +# /content/bin/arduino-cli core update-index +# /content/bin/arduino-cli core install arduino:mbed_nano +# +# Using the GPU +# ^^^^^^^^^^^^^ +# +# This tutorial demonstrates training a neural network, which is requires a lot of computing power +# and will go much faster if you have a GPU. If you are viewing this tutorial on Google Colab, you +# can enable a GPU by going to **Runtime->Change runtime type** and selecting "GPU" as our hardware +# accelerator. If you are running locally, you can `follow TensorFlow's guide `_ instead. +# +# We can test our GPU installation with the following code: + +import tensorflow as tf + +if not tf.test.gpu_device_name(): + print("No GPU was detected!") + print("Model training will take much longer (~30 minutes instead of ~5)") +else: + print("GPU detected - you're good to go.") + +###################################################################### +# Choosing Our Work Dir +# ^^^^^^^^^^^^^^^^^^^^^ +# We need to pick a directory where our image datasets, trained model, and eventual Arduino sketch +# will all live. If running on Google Colab, we'll save everything in ``/root`` (aka ``~``) but you'll +# probably want to store it elsewhere if running locally. Note that this variable only affects Python +# scripts - you'll have to adjust the Bash commands too. + +import os + +FOLDER = "/root" +# sphinx_gallery_start_ignore +import tempfile + +FOLDER = tempfile.mkdtemp() +# sphinx_gallery_end_ignore + +###################################################################### +# Downloading the Data +# -------------------- +# Convolutional neural networks usually learn by looking at many images, along with labels telling +# the network what those images are. To get these images, we'll need a publicly available dataset +# with thousands of images of all sorts of objects and labels of what's in each image. We'll also +# need a bunch of images that **aren't** of cars, as we're trying to distinguish these two classes. +# +# In this tutorial, we'll create a model to detect if an image contains a **car**, but you can use +# whatever category you like! Just change the source URL below to one containing images of another +# type of object. +# +# To get our car images, we'll be downloading the `Stanford Cars dataset `_, +# which contains 16,185 full color images of cars. We'll also need images of random things that +# aren't cars, so we'll use the `COCO 2017 `_ validation set (it's +# smaller, and thus faster to download than the full training set. Training on the full data set +# would yield better results). 
+# Note that there are some cars in the COCO 2017 data set, but it's
+# a small enough fraction not to matter - just keep in mind that this will drive down our perceived
+# accuracy slightly.
+#
+# We could use the TensorFlow dataloader utilities, but we'll instead do it manually to make sure
+# it's easy to change the datasets being used. We'll end up with the following file hierarchy:
+#
+# .. code-block::
+#
+# /root
+# ├── images
+# │   ├── target
+# │   │   ├── 000001.jpg
+# │   │   │ ...
+# │   │   └── 016185.jpg
+# │   ├── target.tgz
+# │   ├── random
+# │   │   ├── 000000000139.jpg
+# │   │   │ ...
+# │   │   └── 000000581781.jpg
+# │   └── random.zip
+#
+# We should also note that the Stanford Cars dataset has 8k images, while the COCO 2017 validation
+# set is 5k images - it is not a 50/50 split! If we wanted to, we could weight these classes differently
+# during training to correct for this, but training will still work if we ignore it. It should
+# take about **2 minutes** to download the Stanford Cars dataset, while COCO 2017 validation will take
+# **1 minute**.
+
+import os
+import shutil
+import urllib.request
+
+# Download datasets
+os.makedirs(f"{FOLDER}/images")
+urllib.request.urlretrieve(
+    "http://ai.stanford.edu/~jkrause/car196/cars_train.tgz", f"{FOLDER}/images/target.tgz"
+)
+urllib.request.urlretrieve(
+    "http://images.cocodataset.org/zips/val2017.zip", f"{FOLDER}/images/random.zip"
+)
+
+# Extract them and rename their folders
+shutil.unpack_archive(f"{FOLDER}/images/target.tgz", f"{FOLDER}/images")
+shutil.unpack_archive(f"{FOLDER}/images/random.zip", f"{FOLDER}/images")
+shutil.move(f"{FOLDER}/images/cars_train", f"{FOLDER}/images/target")
+shutil.move(f"{FOLDER}/images/val2017", f"{FOLDER}/images/random")
+
+######################################################################
+# Loading the Data
+# ----------------
+# Currently, our data is stored on-disk as JPG files of various sizes. To train with it, we'll have
+# to load the images into memory, resize them to be 64x64, and convert them to raw, uncompressed
+# data. Keras's ``image_dataset_from_directory`` will take care of most of this, though it loads
+# images such that each pixel value is a float from 0 to 255.
+#
+# We'll also need to load labels, though Keras will help with this. From our subdirectory structure,
+# it knows the images in ``/target`` are one class, and those in ``/random`` another. Setting
+# ``label_mode='categorical'`` tells Keras to convert these into **categorical labels** - a 2x1 vector
+# that's either ``[1, 0]`` for an object of our target class, or ``[0, 1]`` for anything else.
+# We'll also set ``shuffle=True`` to randomize the order of our examples.
+#
+# We will also **batch** the data - grouping samples into clumps to make our training go faster.
+# We'll set ``batch_size = 32``, which is a decent default.
+#
+# Lastly, in machine learning we generally want our inputs to be small numbers. We'll thus use a
+# ``Rescaling`` layer to change our images such that each pixel is a float between ``0.0`` and ``1.0``,
+# instead of ``0`` to ``255``. We need to be careful not to rescale our categorical labels though, so
+# we'll use a ``lambda`` function.
+
+IMAGE_SIZE = (64, 64, 3)
+unscaled_dataset = tf.keras.utils.image_dataset_from_directory(
+    f"{FOLDER}/images",
+    batch_size=32,
+    shuffle=True,
+    label_mode="categorical",
+    image_size=IMAGE_SIZE[0:2],
+)
+rescale = tf.keras.layers.Rescaling(scale=1.0 / 255)
+full_dataset = unscaled_dataset.map(lambda im, lbl: (rescale(im), lbl))
+
+######################################################################
+# What's Inside Our Dataset?
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+# Before giving this data set to our neural network, we ought to give it a quick visual inspection.
+# Does the data look properly transformed? Do the labels seem appropriate? And what's our ratio of
+# objects to other stuff? We can display some examples from our datasets using ``matplotlib``:
+
+import matplotlib.pyplot as plt
+
+num_target_class = len(os.listdir(f"{FOLDER}/images/target/"))
+num_random_class = len(os.listdir(f"{FOLDER}/images/random/"))
+print(f"{FOLDER}/images/target contains {num_target_class} images")
+print(f"{FOLDER}/images/random contains {num_random_class} images")
+
+# Show some samples and their labels
+SAMPLES_TO_SHOW = 10
+plt.figure(figsize=(20, 10))
+for i, (image, label) in enumerate(unscaled_dataset.unbatch()):
+    if i >= SAMPLES_TO_SHOW:
+        break
+    ax = plt.subplot(1, SAMPLES_TO_SHOW, i + 1)
+    plt.imshow(image.numpy().astype("uint8"))
+    plt.title(list(label.numpy()))
+    plt.axis("off")
+
+######################################################################
+# Validating our Accuracy
+# ^^^^^^^^^^^^^^^^^^^^^^^
+# While developing our model, we'll often want to check how accurate it is (e.g. to see if it
+# improves during training). How do we do this? We could just train it on *all* of the data, and
+# then ask it to classify that same data. However, our model could cheat by just memorizing all of
+# the samples, which would make it *appear* to have very high accuracy, but perform very badly in
+# reality. In practice, this "memorizing" is called **overfitting**.
+#
+# To prevent this, we will set aside some of the data (we'll use 20%) as a **validation set**. Our
+# model will never be trained on validation data - we'll only use it to check our model's accuracy.
+
+num_batches = len(full_dataset)
+train_dataset = full_dataset.take(int(num_batches * 0.8))
+validation_dataset = full_dataset.skip(len(train_dataset))
+
+######################################################################
+# Defining Our Model
+# ------------------
+# In the past decade, `convolutional neural networks `_ have been widely
+# adopted for image classification tasks. State-of-the-art models like `EfficientNet V2 `_ are able
+# to perform image classification better than even humans! Unfortunately, these models have tens of
+# millions of parameters, and thus won't fit on cheap security camera computers.
+#
+# Our applications generally don't need perfect accuracy - 90% is good enough. We can thus use the
+# older and smaller MobileNet V1 architecture. But this *still* won't be small enough - by default,
+# MobileNet V1 with 224x224 inputs and alpha 1.0 takes ~50 MB to just **store**. To reduce the size
+# of the model, there are three knobs we can turn. First, we can reduce the size of the input images
+# from 224x224 to 96x96 or 64x64, and Keras makes it easy to do this. We can also reduce the **alpha**
+# of the model, from 1.0 to 0.25, which downscales the width of the network (and the number of
+# filters) by a factor of four.
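+#
+# To get a rough feel for how much the **alpha** knob matters, one optional check (a sketch that is
+# not required for the rest of this tutorial, and assumes only that ``tf`` has been imported as above)
+# is to build a couple of randomly initialized MobileNets and compare their parameter counts:
+#
+# .. code-block:: python
+#
+#     for alpha in (1.0, 0.25):
+#         m = tf.keras.applications.MobileNet(
+#             input_shape=(64, 64, 3), alpha=alpha, weights=None
+#         )
+#         print(f"alpha={alpha}: {m.count_params():,} parameters")
+#
+# Note that for MobileNet V1 the parameter count is driven by alpha rather than by the input
+# resolution - shrinking the input mainly reduces the memory needed for intermediate activations
+# at run time.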
+# And if we were really strapped for space, we could reduce the
+# number of **channels** by making our model take grayscale images instead of RGB ones.
+#
+# In this tutorial, we will use an RGB 64x64 input image and alpha 0.25. This is not quite
+# ideal, but it allows the finished model to fit in 192 KB of RAM, while still letting us perform
+# transfer learning using the official TensorFlow source models (if we used alpha <0.25 or a
+# grayscale input, we wouldn't be able to do this).
+#
+# What is Transfer Learning?
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+# Deep learning has `dominated image classification `_ for a long time,
+# but training neural networks takes a lot of time. When a neural network is trained "from scratch",
+# its parameters start out randomly initialized, forcing it to learn very slowly how to tell images
+# apart.
+#
+# With transfer learning, we instead start with a neural network that's **already** good at a
+# specific task. In this example, that task is classifying images from `the ImageNet database `_. This
+# means the network already has some object detection capabilities, and is likely closer to what you
+# want than a random model would be.
+#
+# This works especially well with image processing neural networks like MobileNet. In practice, it
+# turns out the convolutional layers of the model (i.e. the first 90% of the layers) are used for
+# identifying low-level features like lines and shapes - only the last few fully connected layers
+# are used to determine how those shapes make up the objects the network is trying to detect.
+#
+# We can take advantage of this by starting training with a MobileNet model that was trained on
+# ImageNet, and already knows how to identify those lines and shapes. We can then just remove the
+# last few layers from this pretrained model, and add our own final layers. We'll then train this
+# conglomerate model for a few epochs on our cars vs non-cars dataset, to adjust the first layers
+# and train the last layers from scratch. This process of training an already-partially-trained
+# model is called *fine-tuning*.
+#
+# Source MobileNets for transfer learning have been `pretrained by the TensorFlow folks `_, so we
+# can just download the one closest to what we want (the 128x128 input model with 0.25 depth scale).
+
+os.makedirs(f"{FOLDER}/models")
+WEIGHTS_PATH = f"{FOLDER}/models/mobilenet_2_5_128_tf.h5"
+urllib.request.urlretrieve(
+    "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_2_5_128_tf.h5",
+    WEIGHTS_PATH,
+)
+
+pretrained = tf.keras.applications.MobileNet(
+    input_shape=IMAGE_SIZE, weights=WEIGHTS_PATH, alpha=0.25
+)
+
+######################################################################
+# Modifying Our Network
+# ^^^^^^^^^^^^^^^^^^^^^
+# As mentioned above, our pretrained model is designed to classify the 1,000 ImageNet categories,
+# but we want to convert it to classify cars. Since only the bottom few layers are task-specific,
+# we'll **cut off the last five layers** of our original model. In their place we'll build our own
+# "tail" to the model by performing reshape, dropout, flatten, and softmax operations.
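+#
+# If you are curious exactly which layers are about to be dropped, one optional way to peek at them
+# (purely illustrative - nothing later in the tutorial depends on it) is:
+#
+# .. code-block:: python
+#
+#     for layer in pretrained.layers[-5:]:
+#         print(layer.name, layer.output_shape)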
+
+model = tf.keras.models.Sequential()
+
+model.add(tf.keras.layers.InputLayer(input_shape=IMAGE_SIZE))
+model.add(tf.keras.Model(inputs=pretrained.inputs, outputs=pretrained.layers[-5].output))
+
+model.add(tf.keras.layers.Reshape((-1,)))
+model.add(tf.keras.layers.Dropout(0.1))
+model.add(tf.keras.layers.Flatten())
+model.add(tf.keras.layers.Dense(2, activation="softmax"))
+
+######################################################################
+# Fine Tuning Our Network
+# ^^^^^^^^^^^^^^^^^^^^^^^
+# When training neural networks, we must set a parameter called the **learning rate** that controls
+# how fast our network learns. It must be set carefully - too slow, and our network will take
+# forever to train; too fast, and our network won't be able to learn some fine details. Generally
+# for Adam (the optimizer we're using), ``0.001`` is a pretty good learning rate (and is what's
+# recommended in the `original paper `_). However, in this case
+# ``0.0005`` seems to work a little better.
+#
+# We'll also pass the validation set from earlier to ``model.fit``. This will evaluate how good our
+# model is each time we train it, and let us track how our model is improving. Once training is
+# finished, the model should have a validation accuracy around ``0.98`` (meaning it was right 98% of
+# the time on our validation set).
+
+model.compile(
+    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
+    loss="categorical_crossentropy",
+    metrics=["accuracy"],
+)
+model.fit(train_dataset, validation_data=validation_dataset, epochs=3, verbose=2)
+
+######################################################################
+# Quantization
+# ------------
+# We've done a decent job of reducing our model's size so far - changing the input dimension,
+# along with removing the bottom layers, reduced the model to just 219k parameters. However, each of
+# these parameters is a ``float32`` that takes four bytes, so our model will take up almost one MB!
+#
+# Additionally, it might be the case that our hardware doesn't have built-in support for floating
+# point numbers. While most high-memory Arduinos (like the Nano 33 BLE) do have hardware support,
+# some others (like the Arduino Due) do not. On any boards *without* dedicated hardware support,
+# floating point multiplication will be extremely slow.
+#
+# To address both issues, we will **quantize** the model - representing the weights as eight-bit
+# integers. It's more complex than just rounding, though - to get the best performance, TensorFlow
+# tracks how each neuron in our model activates, so we can figure out how to most accurately simulate
+# the neuron's original activations with integer operations.
+#
+# We will help TensorFlow do this by creating a representative dataset - a subset of the original
+# that is used for tracking how those neurons activate. We'll then pass this into a ``TFLiteConverter``
+# (Keras itself does not have quantization support) with an ``Optimize`` flag to tell TFLite to perform
+# the conversion. By default, TFLite keeps the inputs and outputs of our model as floats, so we must
+# explicitly tell it to avoid this behavior.
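+#
+# After running the conversion below, one optional sanity check (again just a sketch, not required
+# by the rest of the tutorial) is to load the result back into the TFLite interpreter and confirm
+# that the model's inputs and outputs really did become ``uint8``:
+#
+# .. code-block:: python
+#
+#     interpreter = tf.lite.Interpreter(model_content=quantized_model)
+#     print(interpreter.get_input_details()[0]["dtype"])   # expect numpy uint8
+#     print(interpreter.get_output_details()[0]["dtype"])  # expect numpy uint8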
+ + +def representative_dataset(): + for image_batch, label_batch in full_dataset.take(10): + yield [image_batch] + + +converter = tf.lite.TFLiteConverter.from_keras_model(model) +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.representative_dataset = representative_dataset +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.uint8 + +quantized_model = converter.convert() + +###################################################################### +# Download the Model if Desired +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We've now got a finished model that you can use locally or in other tutorials (try autotuning +# this model or viewing it on `https://netron.app/ `_). But before we do +# those things, we'll have to write it to a file (``quantized.tflite``). If you're running this +# tutorial on Google Colab, you'll have to uncomment the last two lines to download the file +# after writing it. + +QUANTIZED_MODEL_PATH = f"{FOLDER}/models/quantized.tflite" +with open(QUANTIZED_MODEL_PATH, "wb") as f: + f.write(quantized_model) +# from google.colab import files +# files.download(QUANTIZED_MODEL_PATH) + +###################################################################### +# Compiling With TVM For Arduino +# ------------------------------ +# TensorFlow has a built-in framework for deploying to microcontrollers - `TFLite Micro `_. However, +# it's poorly supported by development boards and does not support autotuning. We will use Apache +# TVM instead. +# +# TVM can be used either with its command line interface (``tvmc``) or with its Python interface. The +# Python interface is fully-featured and more stable, so we'll use it here. +# +# TVM is an optimizing compiler, and optimizations to our model are performed in stages via +# **intermediate representations**. The first of these is `Relay `_ a high-level intermediate +# representation emphasizing portability. The conversion from ``.tflite`` to Relay is done without any +# knowledge of our "end goal" - the fact we intend to run this model on an Arduino. +# +# Choosing an Arduino Board +# ^^^^^^^^^^^^^^^^^^^^^^^^^ +# Next, we'll have to decide exactly which Arduino board to use. The Arduino sketch that we +# ultimately generate should be compatible with any board, but knowing which board we are using in +# advance allows TVM to adjust its compilation strategy to get better performance. +# +# There is one catch - we need enough **memory** (flash and RAM) to be able to run our model. We +# won't ever be able to run a complex vision model like a MobileNet on an Arduino Uno - that board +# only has 2 kB of RAM and 32 kB of flash! Our model has ~200,000 parameters, so there is just no +# way it could fit. +# +# For this tutorial, we will use the Nano 33 BLE, which has 1 MB of flash memory and 256 KB of RAM. +# However, any other Arduino with those specs or better should also work. +# +# Generating our project +# ^^^^^^^^^^^^^^^^^^^^^^ +# Next, we'll compile the model to TVM's MLF (model library format) intermediate representation, +# which consists of C/C++ code and is designed for autotuning. To improve performance, we'll tell +# TVM that we're compiling for the ``nrf52840`` microprocessor (the one the Nano 33 BLE uses). We'll +# also tell it to use the C runtime (abbreviated ``crt``) and to use ahead-of-time memory allocation +# (abbreviated ``aot``, which helps reduce the model's memory footprint). 
Lastly, we will disable +# vectorization with ``"tir.disable_vectorize": True``, as C has no native vectorized types. +# +# Once we have set these configuration parameters, we will call ``tvm.relay.build`` to compile our +# Relay model into the MLF intermediate representation. From here, we just need to call +# ``tvm.micro.generate_project`` and pass in the Arduino template project to finish compilation. + +import shutil +import tflite +import tvm + +# Method to load model is different in TFLite 1 vs 2 +try: # TFLite 2.1 and above + tflite_model = tflite.Model.GetRootAsModel(quantized_model, 0) +except AttributeError: # Fall back to TFLite 1.14 method + tflite_model = tflite.Model.Model.GetRootAsModel(quantized_model, 0) + +# Convert to the Relay intermediate representation +mod, params = tvm.relay.frontend.from_tflite(tflite_model) + +# Set configuration flags to improve performance +target = tvm.target.target.micro("nrf52840") +runtime = tvm.relay.backend.Runtime("crt") +executor = tvm.relay.backend.Executor("aot", {"unpacked-api": True}) + +# Convert to the MLF intermediate representation +with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + mod = tvm.relay.build(mod, target, runtime=runtime, executor=executor, params=params) + +# Generate an Arduino project from the MLF intermediate representation +shutil.rmtree(f"{FOLDER}/models/project", ignore_errors=True) +arduino_project = tvm.micro.generate_project( + tvm.micro.get_microtvm_template_projects("arduino"), + mod, + f"{FOLDER}/models/project", + { + "arduino_board": "nano33ble", + "arduino_cli_cmd": "/content/bin/arduino-cli", + "project_type": "example_project", + }, +) + +###################################################################### +# Testing our Arduino Project +# --------------------------- +# Consider the following two 224x224 images from the author's camera roll - one of a car, one not. +# We will test our Arduino project by loading both of these images and executing the compiled model +# on them. +# +# .. image:: https://raw.githubusercontent.com/guberti/web-data/micro-train-tutorial-data/testdata/microTVM/data/model_train_images_combined.png +# :align: center +# :height: 200px +# :width: 600px +# +# Currently, these are 224x224 PNG images we can download from Imgur. Before we can feed in these +# images, we'll need to resize and convert them to raw data, which can be done with ``imagemagick``. +# +# It's also challenging to load raw data onto an Arduino, as only C/CPP files (and similar) are +# compiled. We can work around this by embedding our raw data in a hard-coded C array with the +# built-in utility ``bin2c`` that will output a file like below: +# +# .. code-block:: c +# +# static const unsigned char CAR_IMAGE[] = { +# 0x22,0x23,0x14,0x22, +# ... +# 0x07,0x0e,0x08,0x08 +# }; +# +# We can do both of these things with a few lines of Bash code: +# +# .. 
code-block:: bash
+#
+# %%bash
+# mkdir -p ~/tests
+# curl "https://i.imgur.com/JBbEhxN.png" -o ~/tests/car_224.png
+# convert ~/tests/car_224.png -resize 64 ~/tests/car_64.png
+# stream ~/tests/car_64.png ~/tests/car.raw
+# bin2c -c -st ~/tests/car.raw --name CAR_IMAGE > ~/models/project/car.c
+#
+# curl "https://i.imgur.com/wkh7Dx2.png" -o ~/tests/catan_224.png
+# convert ~/tests/catan_224.png -resize 64 ~/tests/catan_64.png
+# stream ~/tests/catan_64.png ~/tests/catan.raw
+# bin2c -c -st ~/tests/catan.raw --name CATAN_IMAGE > ~/models/project/catan.c
+
+######################################################################
+# Writing our Arduino Script
+# --------------------------
+# We now need a little bit of Arduino code to read the two binary arrays we just generated, run the
+# model on them, and log the output to the serial monitor. This file will replace ``arduino_sketch.ino``
+# as the main file of our sketch. You'll have to copy this code in manually.
+#
+# .. code-block:: c
+#
+# %%writefile /root/models/project.ino
+# #include "src/model.h"
+# #include "car.c"
+# #include "catan.c"
+#
+# void setup() {
+# Serial.begin(9600);
+# TVMInitialize();
+# }
+#
+# void loop() {
+# uint8_t result_data[2];
+# Serial.println("Car results:");
+# TVMExecute(const_cast(CAR_IMAGE), result_data);
+# Serial.print(result_data[0]); Serial.print(", ");
+# Serial.print(result_data[1]); Serial.println();
+#
+# Serial.println("Other object results:");
+# TVMExecute(const_cast(CATAN_IMAGE), result_data);
+# Serial.print(result_data[0]); Serial.print(", ");
+# Serial.print(result_data[1]); Serial.println();
+#
+# delay(1000);
+# }
+#
+# Compiling Our Code
+# ^^^^^^^^^^^^^^^^^^
+# Now that our project has been generated, TVM's job is mostly done! We can still call
+# ``arduino_project.build()`` and ``arduino_project.upload()``, but these just use ``arduino-cli``'s
+# compile and flash commands underneath. We could also begin autotuning our model, but that's a
+# subject for a different tutorial. To finish up, we'll verify no compiler errors are thrown
+# by our project:
+
+shutil.rmtree(f"{FOLDER}/models/project/build", ignore_errors=True)
+# sphinx_gallery_start_ignore
+from unittest.mock import MagicMock
+
+arduino_project = MagicMock()
+# sphinx_gallery_end_ignore
+arduino_project.build()
+print("Compilation succeeded!")
+
+######################################################################
+# Uploading to Our Device
+# -----------------------
+# The very last step is uploading our sketch to an Arduino to make sure our code works properly.
+# Unfortunately, we can't do that from Google Colab, so we'll have to download our sketch. This is
+# simple enough to do - we'll just turn our project into a ``.zip`` archive, and call ``files.download``.
+# If you're running on Google Colab, you'll have to uncomment the last two lines to download the file
+# after writing it.
+
+ZIP_FOLDER = f"{FOLDER}/models/project"
+shutil.make_archive(ZIP_FOLDER, "zip", ZIP_FOLDER)
+# from google.colab import files
+# files.download(f"{FOLDER}/models/project.zip")
+# sphinx_gallery_start_ignore
+# Run a few unit tests to make sure the Python code worked
+
+# Ensure the transfer-learned model was correctly assembled
+assert len(model.layers) == 5
+assert model.count_params() == 219058  # Only 219,058 of these are trainable
+
+assert len(quantized_model) >= 250000  # Quantized model will be 250 KB - 350 KB
+assert len(quantized_model) <= 350000  # Exact value depends on quantization
+
+# Assert .tflite and .zip files were written to disk
+assert os.path.isfile(f"{FOLDER}/models/quantized.tflite")
+assert os.path.isfile(f"{FOLDER}/models/project.zip")
+
+# Assert MLF file was correctly generated
+assert str(mod.executor) == "aot"
+
+# Remove the temporary folder we generated at the beginning
+shutil.rmtree(FOLDER)
+# sphinx_gallery_end_ignore
+
+
+######################################################################
+# From here, we'll need to open it in the Arduino IDE. You'll have to download the IDE as well as
+# the SDK for whichever board you are using. For certain boards like the Sony SPRESENSE, you may
+# have to change settings to control how much memory you want the board to use.
+#
+# Expected Results
+# ^^^^^^^^^^^^^^^^
+# If all works as expected, you should see the following output on a Serial monitor:
+#
+# .. code-block::
+#
+# Car results:
+# 255, 0
+# Other object results:
+# 0, 255
+#
+# The first number represents the model's confidence that the object **is** a car and ranges from
+# 0-255. The second number represents the model's confidence that the object **is not** a car and
+# is also 0-255. These results mean the model is very sure that the first image is a car, and the
+# second image is not (which is correct). Hence, our model is working!
+#
+# Summary
+# -------
+# In this tutorial, we used transfer learning to quickly train an image recognition model to
+# identify cars. We modified its input dimensions and last few layers to make it better at this,
+# and to make it faster and smaller. We then quantized the model and compiled it using TVM to
+# create an Arduino sketch. Lastly, we tested the model using two static images to prove it works
+# as intended.
+#
+# Next Steps
+# ^^^^^^^^^^
+# From here, we could modify the model to read live images from the camera - we have another
+# Arduino tutorial for how to do that `on GitHub `_. Alternatively, we could also
+# `use TVM's autotuning capabilities `_ to dramatically improve the model's performance.
+#
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 599bbaddceec9..1ffd2d20e7ae9 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -260,7 +260,8 @@ def docs(
 "tlcpack-sphinx-addon==0.2.1",
 "synr==0.5.0",
 "image==1.5.33",
- "sphinx-gallery==0.4.0",
+ # Temporary git link until a release is published
+ "git+https://github.com/sphinx-gallery/sphinx-gallery.git@6142f1791151849b5bec4bf3959f75697ba226cd",
 "sphinx-rtd-theme==1.0.0",
 "matplotlib==3.3.4",
 "commonmark==0.9.1",
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index b4b52ed36ccf1..da1a2c9c5636a 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -84,6 +84,8 @@ IGNORED_WARNINGS=(
 'autotvm:Cannot find config for target=llvm -keys=cpu -link-params=0'
 'autotvm:One or more operators have not been tuned. Please tune your model for better performance.
Use DEBUG logging level to see more details.' 'autotvm:Cannot find config for target=cuda -keys=cuda,gpu' + # Warning is thrown during TFLite quantization for micro_train tutorial + 'absl:For model inputs containing unsupported operations which cannot be quantized, the `inference_input_type` attribute will default to the original type.' ) JOINED_WARNINGS=$(join_by '|' "${IGNORED_WARNINGS[@]}") From fe24fa9840500b9217f5773e65a764a16e998a66 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sat, 4 Jun 2022 01:37:23 -0700 Subject: [PATCH 041/181] [Bugfix][MetaSchedule] Auto-bind when there are no spatial loops (#11570) --- src/meta_schedule/schedule_rule/auto_bind.cc | 38 +++++++++++----- ...t_meta_schedule_schedule_rule_auto_bind.py | 45 ++++++++++++++++++- 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc index 9c16856557e00..61f8e4f6fc54f 100644 --- a/src/meta_schedule/schedule_rule/auto_bind.cc +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -72,7 +72,7 @@ void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv, if (i_multi_child == -1) { i_multi_child = n; } - if ((i_block_idx != -1 && i_thread_idx != -1) || i_spatial_loop == -1) { + if (i_block_idx != -1 && i_thread_idx != -1) { return; } if (i_block_idx != -1 && i_thread_idx == -1) { @@ -80,16 +80,34 @@ void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv, throw; } LoopRV loop_rv{nullptr}; - if (i_block_idx == -1 && i_thread_idx != -1) { - int num_fuse = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1); + { Array loop_rvs = sch->GetLoops(block_rv); - loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); - sch->Bind(loop_rv, "blockIdx.x"); - return; - } else { // i_block_idx == -1 && i_thread_idx == -1 - Array loop_rvs = sch->GetLoops(block_rv); - int num_fuse = std::min(i_multi_child, i_spatial_loop + 1); - loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); + if (i_spatial_loop == -1) { + Array split = sch->Split(loop_rvs[0], {Integer(1), NullOpt}); + ICHECK_EQ(split.size(), 2); + loop_rvs.Set(0, split[1]); + loop_rvs.insert(loop_rvs.begin(), split[0]); + i_spatial_loop = 0; + if (i_block_idx != -1) { + i_block_idx += 1; + } + if (i_thread_idx != -1) { + i_thread_idx += 1; + } + if (i_multi_child != -1) { + i_multi_child += 1; + } + } + if (i_block_idx == -1 && i_thread_idx != -1) { + int num_fuse = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1); + Array loop_rvs = sch->GetLoops(block_rv); + loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); + sch->Bind(loop_rv, "blockIdx.x"); + return; + } else { // i_block_idx == -1 && i_thread_idx == -1 + int num_fuse = std::min(i_multi_child, i_spatial_loop + 1); + loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); + } } int64_t extent = -1; if (const int64_t* e = GetLoopIntExtent(sch->Get(loop_rv).get())) { diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py index bd0a24e8b642e..80a72a4e93ab2 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -20,8 +20,8 @@ from tvm.meta_schedule.testing.schedule_rule import auto_bind from tvm.meta_schedule.testing.space_generation import check_trace from tvm.meta_schedule.tune_context import 
TuneContext -from tvm.target import Target from tvm.script import tir as T +from tvm.target import Target @T.prim_func @@ -34,6 +34,25 @@ def element_wise(var_A: T.handle, var_B: T.handle) -> None: B[vi, vj] = A[vi, vj] + 1.0 +@T.prim_func +def reduction_loop_only( + A: T.Buffer[2, "float32"], + B: T.Buffer[2, "float32"], + C: T.Buffer[(), "float32"], +) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + for i0 in T.serial(2): + with T.block("C"): + k0 = T.axis.reduce(2, i0) + T.reads(A[k0], B[k0]) + T.writes(C[()]) + with T.init(): + C[()] = T.float32(1.0) + C[()] = T.min(C[()], A[k0] / B[k0]) + + def _create_context(mod, target, rule) -> TuneContext: ctx = TuneContext( mod=mod, @@ -71,5 +90,29 @@ def test_cuda_element_wise(): check_trace(spaces, expected) +def test_cuda_reduction_loop_only(): + expected = [ + [ + 'b0 = sch.get_block(name="C", func_name="main")', + "l1, = sch.get_loops(block=b0)", + "l2, l3 = sch.split(loop=l1, factors=[1, None])", + "l4 = sch.fuse(l2)", + "l5, l6 = sch.split(loop=l4, factors=[None, 1])", + 'sch.bind(loop=l5, thread_axis="blockIdx.x")', + 'sch.bind(loop=l6, thread_axis="threadIdx.x")', + ] + ] + target = Target("nvidia/geforce-rtx-3080", host="llvm") + ctx = _create_context( + reduction_loop_only, + target=target, + rule=auto_bind(target=target), + ) + spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) + assert len(spaces) == 1 + check_trace(spaces, expected) + + if __name__ == "__main__": test_cuda_element_wise() + test_cuda_reduction_loop_only() From 9d2c9a7f6457fb98156a722625c95bf3383dec42 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sat, 4 Jun 2022 17:48:19 -0700 Subject: [PATCH 042/181] [TIR] Schedule Primitive: Add-Unit-Loop (#11575) In TE, a unit loop could be introduced by fusing an empty list of loops on a stage. This PR adds its counterpart in TIR, while being a bit more explicit with a new schedule primitive which adds a unit loop without impacting any existing functionalities. --- include/tvm/tir/schedule/schedule.h | 12 ++++ python/tvm/tir/schedule/schedule.py | 64 +++++++++++++++-- src/tir/schedule/concrete_schedule.cc | 18 +++++ src/tir/schedule/concrete_schedule.h | 2 + src/tir/schedule/primitive.h | 10 +++ .../schedule/primitive/loop_transformation.cc | 69 +++++++++++++++++++ src/tir/schedule/schedule.cc | 12 ++++ src/tir/schedule/traced_schedule.cc | 22 ++++++ src/tir/schedule/traced_schedule.h | 2 + .../unittest/test_tir_schedule_split_fuse.py | 58 ++++++++++++++++ 10 files changed, 265 insertions(+), 4 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 68900e107d7c9..d3ecd8a1135b8 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -303,6 +303,18 @@ class ScheduleNode : public runtime::Object { * \param ordered_loop_rvs The loops in the new order */ virtual void Reorder(const Array& ordered_loop_rvs) = 0; + /*! + * \brief Create a new unit loop on top of the specific block. + * \param block_rv The block above which the new loop is created + * \return The new loop created + */ + virtual LoopRV AddUnitLoop(const BlockRV& block_rv) = 0; + /*! + * \brief Create a new unit loop on top of the specific loop. + * \param loop_rv The loop above which the new loop is created + * \return The new loop created + */ + virtual LoopRV AddUnitLoop(const LoopRV& loop_rv) = 0; /******** Schedule: Manipulate ForKind ********/ /*! * \brief Parallelize the input loop. 
It requires: diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 4179088aa534d..d225280b655f7 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -15,19 +15,19 @@ # specific language governing permissions and limitations # under the License. """The TensorIR schedule class""" -from typing import Callable, Dict, List, Optional, Union, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Union from tvm._ffi import register_object as _register_object from tvm.error import TVMError, register_error from tvm.ir import IRModule, PrimExpr from tvm.runtime import Object, String -from tvm.tir import Block, FloatImm, For, IntImm, PrimFunc, Buffer -from ..function import IndexMap +from tvm.tir import Block, Buffer, FloatImm, For, IntImm, PrimFunc +from ..function import IndexMap from . import _ffi_api +from ._type_checker import type_checked from .state import ScheduleState, StmtSRef, _parse_debug_mask, _parse_mod from .trace import Trace -from ._type_checker import type_checked @register_error @@ -685,6 +685,62 @@ def after_reorder(a: T.handle, b: T.handle) -> None: """ _ffi_api.ScheduleReorder(self, ordered_loops) # type: ignore # pylint: disable=no-member + @type_checked + def add_unit_loop(self, block_or_loop: Union[LoopRV, BlockRV]) -> LoopRV: + """Create a new unit loop on top of the specific block or loop. + + Parameters + ---------- + block_or_loop : Union[LoopRV, BlockRV] + The block above which the new loop is created + + Returns + ------- + new_loop : LoopRV + The new unit loop + + Examples + -------- + + Before add_unit_loop, in TensorIR, the IR is: + + .. code-block:: python + + @T.prim_func + def before_add_unit_loop( + A: T.Buffer[(), "int32"], + B: T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + Create the schedule and do add-unit-loop: + + .. code-block:: python + + sch = tir.Schedule(before_add_unit_loop) + sch.add_unit_loop(sch.get_block("C")) + print(sch.mod["main"].script()) + + After applying add-unit-loop, the IR becomes: + + .. 
code-block:: python + + @T.prim_func + def after_add_unit_loop( + A: T.Buffer[(), "int32"], + B: T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + for u in T.serial(1): + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + """ + return _ffi_api.ScheduleAddUnitLoop(self, block_or_loop) # type: ignore # pylint: disable=no-member + ########## Schedule: Manipulate ForKind ########## @type_checked diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 590a0f0025954..051bd42506252 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -453,6 +453,24 @@ void ConcreteScheduleNode::Reorder(const Array& ordered_loop_rvs) { this->state_->DebugVerify(); } +LoopRV ConcreteScheduleNode::AddUnitLoop(const BlockRV& block_rv) { + LoopRV result{nullptr}; + TVM_TIR_SCHEDULE_BEGIN(); + result = CreateRV(tir::AddUnitLoop(state_, GetSRef(block_rv))); + TVM_TIR_SCHEDULE_END("add-unit-loop", this->error_render_level_); + this->state_->DebugVerify(); + return result; +} + +LoopRV ConcreteScheduleNode::AddUnitLoop(const LoopRV& loop_rv) { + LoopRV result{nullptr}; + TVM_TIR_SCHEDULE_BEGIN(); + result = CreateRV(tir::AddUnitLoop(state_, GetSRef(loop_rv))); + TVM_TIR_SCHEDULE_END("add-unit-loop", this->error_render_level_); + this->state_->DebugVerify(); + return result; +} + /******** Schedule: Manipulate ForKind ********/ void ConcreteScheduleNode::Parallel(const LoopRV& loop_rv) { diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 70c0265611c31..11d68694a1fec 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -99,6 +99,8 @@ class ConcreteScheduleNode : public ScheduleNode { LoopRV Fuse(const Array& loop_rvs) override; Array Split(const LoopRV& loop_rv, const Array>& factors) override; void Reorder(const Array& ordered_loop_rvs) override; + LoopRV AddUnitLoop(const BlockRV& block_rv) override; + LoopRV AddUnitLoop(const LoopRV& loop_rv) override; /******** Schedule: Manipulate ForKind ********/ void Parallel(const LoopRV& loop_rv) override; void Vectorize(const LoopRV& loop_rv) override; diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index f4dba69c6b156..af0f417e4cf50 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -186,6 +186,16 @@ TVM_DLL StmtSRef Fuse(ScheduleState self, const Array& loop_srefs); */ TVM_DLL void Reorder(ScheduleState self, const Array& ordered_loop_srefs); +/*! + * \brief Create a new unit loop on top of the specific block or loop. + * \param sref The block/loop above which the new thread_binding loop is created + * \param extent The extent of the new thread_binding loop + * \param thread_axis The thread axis of the new thread_binding loop + * \param attrs Extra loop attributes + * \return The new thread_binding loop + */ +TVM_DLL StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref); + /******** Schedule: Manipulate ForKind ********/ /*! * \brief Parallelize the input loop. 
It requires: diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index 5315b139f0f6f..66e29518ca5e1 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -698,6 +698,43 @@ void Reorder(ScheduleState self, const Array& ordered_loop_srefs) { self->Replace(GetRef(top), new_loop, {}); } +StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref) { + if (sref->stmt->IsInstance()) { + For new_loop(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial, GetRef(sref->stmt)); + self->Replace(sref, new_loop, {}); + return self->stmt2ref.at(new_loop.get()); + } + class NewLoopCreator : public StmtMutator { + public: + explicit NewLoopCreator(const StmtNode* src_block) : src_block_(src_block) {} + + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + if (realize->block.get() == src_block_) { + new_loop_ = + For(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial, GetRef(realize)); + return new_loop_; + } + return StmtMutator::VisitStmt_(realize); + } + + const StmtNode* src_block_; + For new_loop_{nullptr}; + }; + + CHECK(sref->parent != nullptr) << "ValueError: Cannot add loops on top of the root block"; + StmtSRef parent_sref = GetRef(sref->parent); + NewLoopCreator creator(sref->stmt); + Stmt new_stmt = creator(GetRef(parent_sref->stmt)); + if (new_stmt->IsInstance()) { + self->Replace(parent_sref, std::move(new_stmt), {}); + } else { + Block old_parent_block = GetRef(parent_sref->StmtAs()); + Block new_parent_block = Downcast(new_stmt); + self->Replace(parent_sref, new_stmt, {{old_parent_block, new_parent_block}}); + } + return self->stmt2ref.at(creator.new_loop_.get()); +} + /******** InstructionKind Registration ********/ struct SplitTraits : public UnpackedInstTraits { @@ -800,9 +837,41 @@ struct ReorderTraits : public UnpackedInstTraits { friend struct ::tvm::tir::UnpackedInstTraits; }; +struct AddUnitLoopTraits : public UnpackedInstTraits { + static constexpr const char* kName = "AddUnitLoop"; + static constexpr bool kIsPure = false; + + private: + static constexpr size_t kNumInputs = 1; + static constexpr size_t kNumAttrs = 0; + static constexpr size_t kNumDecisions = 0; + + static LoopRV UnpackedApplyToSchedule(Schedule sch, ObjectRef rv) { + if (const auto* block = rv.as()) { + return sch->AddUnitLoop(GetRef(block)); + } else if (const auto* loop = rv.as()) { + return sch->AddUnitLoop(GetRef(loop)); + } else { + LOG(FATAL) << "TypeError: AddUnitLoop expects a loop or block"; + throw; + } + } + + static String UnpackedAsPython(Array outputs, String rv) { + PythonAPICall py("add_unit_loop"); + py.Input("block_or_loop", rv); + py.SingleOutput(outputs); + return py.Str(); + } + + template + friend struct ::tvm::tir::UnpackedInstTraits; +}; + TVM_REGISTER_INST_KIND_TRAITS(SplitTraits); TVM_REGISTER_INST_KIND_TRAITS(FuseTraits); TVM_REGISTER_INST_KIND_TRAITS(ReorderTraits); +TVM_REGISTER_INST_KIND_TRAITS(AddUnitLoopTraits); } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index 3880d0b19eeb8..372d94a15025b 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -153,6 +153,18 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleFuse").set_body_method(&Sche TVM_REGISTER_GLOBAL("tir.schedule.ScheduleSplit").set_body_method(&ScheduleNode::Split); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReorder") .set_body_method(&ScheduleNode::Reorder); 
+TVM_REGISTER_GLOBAL("tir.schedule.ScheduleAddUnitLoop") + .set_body_typed([](Schedule self, ObjectRef rv) -> LoopRV { + if (const auto* loop_rv = rv.as()) { + return self->AddUnitLoop(GetRef(loop_rv)); + } else if (const auto* block_rv = rv.as()) { + return self->AddUnitLoop(GetRef(block_rv)); + } else { + LOG(FATAL) << "TypeError: Cannot evaluate the random variable of type: " << rv->GetTypeKey() + << ". Its value is: " << rv; + throw; + } + }); /******** (FFI) Manipulate ForKind ********/ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleParallel") .set_body_method(&ScheduleNode::Parallel); diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index d2f627edfd11d..95a10e26ac2f8 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -198,6 +198,28 @@ void TracedScheduleNode::Reorder(const Array& ordered_loop_rvs) { /*outputs=*/{})); } +LoopRV TracedScheduleNode::AddUnitLoop(const BlockRV& block_rv) { + LoopRV result = ConcreteScheduleNode::AddUnitLoop(block_rv); + + static const InstructionKind& kind = InstructionKind::Get("AddUnitLoop"); + trace_->Append(/*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{block_rv}, + /*attrs=*/{}, + /*outputs=*/{result})); + return result; +} + +LoopRV TracedScheduleNode::AddUnitLoop(const LoopRV& loop_rv) { + LoopRV result = ConcreteScheduleNode::AddUnitLoop(loop_rv); + + static const InstructionKind& kind = InstructionKind::Get("AddUnitLoop"); + trace_->Append(/*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{loop_rv}, + /*attrs=*/{}, + /*outputs=*/{result})); + return result; +} + /******** Schedule: Manipulate ForKind ********/ void TracedScheduleNode::Parallel(const LoopRV& loop_rv) { diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index ba4a4b99cbb2d..25bf3d4871ae7 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -63,6 +63,8 @@ class TracedScheduleNode : public ConcreteScheduleNode { LoopRV Fuse(const Array& loop_rvs) final; Array Split(const LoopRV& loop_rv, const Array>& factor_rvs) final; void Reorder(const Array& ordered_loop_rvs) final; + LoopRV AddUnitLoop(const BlockRV& block_rv) final; + LoopRV AddUnitLoop(const LoopRV& loop_rv) final; /******** Schedule: Manipulate ForKind ********/ void Parallel(const LoopRV& loop_rv) final; void Vectorize(const LoopRV& loop_rv) final; diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index 16eef57c4748d..d70748bc8a03d 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -524,5 +524,63 @@ def test_fuse_not_affine(): verify_trace_roundtrip(sch=sch, mod=elementwise_not_affine) +def test_add_unit_loop_above_block(): + @T.prim_func + def zero_dim( + A: T.Buffer[(), "int32"], + B: T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + @T.prim_func + def zero_dim_added( + A: T.Buffer[(), "int32"], + B: T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + for u in range(1): + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + sch = tir.Schedule(zero_dim, debug_mask="all") + block = sch.get_block("C") + sch.add_unit_loop(block) + tvm.ir.assert_structural_equal(zero_dim_added, sch.mod["main"]) + + +def test_add_unit_loop_above_loop(): + @T.prim_func + def zero_dim( + A: T.Buffer[(), "int32"], + B: 
T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + for u in range(1): + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + @T.prim_func + def zero_dim_added( + A: T.Buffer[(), "int32"], + B: T.Buffer[(), "int32"], + C: T.Buffer[(), "int32"], + ) -> None: + for u1, u2 in T.grid(1, 1): + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + sch = tir.Schedule(zero_dim, debug_mask="all") + block = sch.get_block("C") + (loop,) = sch.get_loops(block) + sch.add_unit_loop(loop) + tvm.ir.assert_structural_equal(zero_dim_added, sch.mod["main"]) + + if __name__ == "__main__": tvm.testing.main() From ba60788118e7c65c26cb6cf1097a012dd7b647f2 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sat, 4 Jun 2022 21:42:43 -0700 Subject: [PATCH 043/181] [MetaSchedule] Use Add-Unit-Loop in Auto-Bind (#11581) Following #11575, this PR allows CUDA thread binding for TIR programs like ```python @T.prim_func def zero_dim_add( A: T.Buffer[(), "float32"], B: T.Buffer[(), "float32"], C: T.Buffer[(), "float32"], ) -> None: with T.block("C"): vi = T.axis.spatial(1, 0) C[()] = A[()] + B[()] ``` where there is no loop available to be bound to threadIdx/blockIdx. --- src/meta_schedule/schedule_rule/auto_bind.cc | 18 ++++--- ...t_meta_schedule_schedule_rule_auto_bind.py | 47 +++++++++++++++---- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc index 61f8e4f6fc54f..2bc90f3c2e5cf 100644 --- a/src/meta_schedule/schedule_rule/auto_bind.cc +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -30,11 +30,12 @@ void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv, int64_t max_threadblocks, int64_t max_threads_per_block, std::function get_factor) { using namespace tvm::tir; - Array loops = tir::GetLoops(sch->GetSRef(block_rv)); - int n = loops.size(); - if (n == 0) { + StmtSRef block_sref = sch->GetSRef(block_rv); + if (block_sref->parent == nullptr) { return; } + Array loops = tir::GetLoops(block_sref); + int n = loops.size(); int i_block_idx = -1; int i_thread_idx = -1; int i_multi_child = -1; @@ -83,10 +84,13 @@ void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv, { Array loop_rvs = sch->GetLoops(block_rv); if (i_spatial_loop == -1) { - Array split = sch->Split(loop_rvs[0], {Integer(1), NullOpt}); - ICHECK_EQ(split.size(), 2); - loop_rvs.Set(0, split[1]); - loop_rvs.insert(loop_rvs.begin(), split[0]); + LoopRV spatial_loop_rv{nullptr}; + if (loop_rvs.empty()) { + spatial_loop_rv = sch->AddUnitLoop(block_rv); + } else { + spatial_loop_rv = sch->AddUnitLoop(loop_rvs[0]); + } + loop_rvs.insert(loop_rvs.begin(), spatial_loop_rv); i_spatial_loop = 0; if (i_block_idx != -1) { i_block_idx += 1; diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py index 80a72a4e93ab2..8b36ec2f462da 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -40,9 +40,6 @@ def reduction_loop_only( B: T.Buffer[2, "float32"], C: T.Buffer[(), "float32"], ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body for i0 in T.serial(2): with T.block("C"): k0 = T.axis.reduce(2, i0) @@ -53,6 +50,17 @@ def reduction_loop_only( C[()] = T.min(C[()], A[k0] / B[k0]) +@T.prim_func +def zero_dim_add( + A: 
T.Buffer[(), "float32"], + B: T.Buffer[(), "float32"], + C: T.Buffer[(), "float32"], +) -> None: + with T.block("C"): + vi = T.axis.spatial(1, 0) + C[()] = A[()] + B[()] + + def _create_context(mod, target, rule) -> TuneContext: ctx = TuneContext( mod=mod, @@ -95,11 +103,11 @@ def test_cuda_reduction_loop_only(): [ 'b0 = sch.get_block(name="C", func_name="main")', "l1, = sch.get_loops(block=b0)", - "l2, l3 = sch.split(loop=l1, factors=[1, None])", - "l4 = sch.fuse(l2)", - "l5, l6 = sch.split(loop=l4, factors=[None, 1])", - 'sch.bind(loop=l5, thread_axis="blockIdx.x")', - 'sch.bind(loop=l6, thread_axis="threadIdx.x")', + "l2 = sch.add_unit_loop(block_or_loop=l1)", + "l3 = sch.fuse(l2)", + "l4, l5 = sch.split(loop=l3, factors=[None, 1])", + 'sch.bind(loop=l4, thread_axis="blockIdx.x")', + 'sch.bind(loop=l5, thread_axis="threadIdx.x")', ] ] target = Target("nvidia/geforce-rtx-3080", host="llvm") @@ -113,6 +121,29 @@ def test_cuda_reduction_loop_only(): check_trace(spaces, expected) +def test_cuda_zero_dim_add(): + expected = [ + [ + 'b0 = sch.get_block(name="C", func_name="main")', + "l1 = sch.add_unit_loop(block_or_loop=b0)", + "l2 = sch.fuse(l1)", + "l3, l4 = sch.split(loop=l2, factors=[None, 1])", + 'sch.bind(loop=l3, thread_axis="blockIdx.x")', + 'sch.bind(loop=l4, thread_axis="threadIdx.x")', + ] + ] + target = Target("nvidia/geforce-rtx-3080", host="llvm") + ctx = _create_context( + zero_dim_add, + target=target, + rule=auto_bind(target=target), + ) + spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) + assert len(spaces) == 1 + check_trace(spaces, expected) + + if __name__ == "__main__": test_cuda_element_wise() test_cuda_reduction_loop_only() + test_cuda_zero_dim_add() From c732828d48c872ff358191da2e2087d38278bb81 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 5 Jun 2022 11:17:32 -0700 Subject: [PATCH 044/181] [TIR] Prevent loop binding over-simplification (#11578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @vinx13 @jinhongyii and I observe a recent regression on TVM mainline: over-simplification in `Schedule.split` leads to information loss that negatively impacts search space generation. **Impact.** This affects common operators like `softmax` and even simpler reductions. **Example.** Consider splitting a simple reduction loop: ```python @T.prim_func def main( A: T.Buffer[2, "float32"], B: T.Buffer[2, "float32"], C: T.Buffer[(), "float32"], ) -> None: for i in T.serial(2): # <= split `i` into `i_0` and `i_1`, where `i_0` is a trivial loop with T.block("C"): k = T.axis.reduce(2, i) with T.init(): C[()] = T.float32(1) C[()] = T.min(C[()], A[k] / B[k]) ``` Splitting loop `i` by factors `[1, 2]`, we get: ```python @T.prim_func def main( A: T.Buffer[2, "float32"], B: T.Buffer[2, "float32"], C: T.Buffer[(), "float32"], ) -> None: for i_0, i_1 in T.grid(1, 2): with T.block("C"): k = T.axis.reduce(2, i_1) # <= i_0 is not part of the binding, # so the system cannot tell if i_0 is a reduction loop with T.init(): C[()] = T.float32(1) C[()] = T.min(C[()], A[k] / B[k]) ``` In this case, loop `i_0` will be considered as a spatial loop, even it’s the outcome of splitting a reduction loop. However, if we change the factors from `[1, 2]` to `[2, 1]`, loop `i_0` becomes a reduction loop. This means the loop iteration property depends on the loop extent. **Why is it problematic**? MetaSchedule has an assumption: extremely seldomly, a loop extent would impact the iteration property of the loop itself, i.e. 
no matter the extent is 1 or 2 or anything, the fact that the loop is a reduction loop should rarely change. As an example, `Auto-Bind` finds the outer `k` spatial loops, which are fused together and bound to thread axis. In the trace, the number (`k`) of the outer loops has to be a constant. However, if Auto-Bind thinks there are `k=3` outer loops to fuse during search space generation, where the last loop happens to be a reduction loop with extent 1, as shown below: ```python for spatial_loop_0 in range(...): for spatial_loop_1 in range(...): for reduction_loop in range(1): # <= Auto-Bind mistakes this loop as spatial, because extent==1 ``` During evolutionary search, the extent of reduction_loop will change and become larger than 1. In this case, the binding strategy will consistently fail because it considers fusing `k=3` loops - which means the entire search strategy will fail with almost no valid candidates. Thanks @MasterJH5574 for figuring out the root cause of the issue, and @jinhongyii for valuable pointers to the right fix! --- include/tvm/arith/iter_affine_map.h | 5 +++-- src/arith/iter_affine_map.cc | 6 ++++-- .../schedule/primitive/loop_transformation.cc | 5 +++-- ...edule_postproc_rewrite_cooperative_fetch.py | 2 +- ...st_meta_schedule_schedule_rule_auto_bind.py | 1 - .../unittest/test_tir_schedule_reorder.py | 2 +- .../unittest/test_tir_schedule_split_fuse.py | 12 ++++++------ .../unittest/test_tir_schedule_transform.py | 18 ++++++------------ 8 files changed, 24 insertions(+), 27 deletions(-) diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index 2c0e5e92997af..6b98d84fdf17e 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -349,11 +349,12 @@ IterMapResult DetectIterMap(const Array& indices, const Map IterMapSimplify(const Array& indices, const Map& input_iters, - const PrimExpr& input_pred, IterMapLevel check_level); + const PrimExpr& input_pred, IterMapLevel check_level, + bool simplify_trivial_iterators = true); /*! * \brief Apply the inverse of the affine transformation to the outputs. 
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index cce826fedca64..ace7b7f84441f 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -1720,10 +1720,12 @@ PrimExpr NormalizeIterMapToExpr(const PrimExpr& expr) { TVM_REGISTER_GLOBAL("arith.NormalizeIterMapToExpr").set_body_typed(NormalizeIterMapToExpr); Array IterMapSimplify(const Array& indices, const Map& input_iters, - const PrimExpr& input_pred, IterMapLevel check_level) { + const PrimExpr& input_pred, IterMapLevel check_level, + bool simplify_trivial_iterators) { if (!IterRangeSanityCheck(input_iters)) return indices; Analyzer analyzer; - auto res = DetectIterMap(indices, input_iters, input_pred, check_level, &analyzer); + auto res = DetectIterMap(indices, input_iters, input_pred, check_level, &analyzer, + /*simplify_trivial_iterators=*/simplify_trivial_iterators); Array rewrite = res->indices; if (rewrite.empty()) { diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index 66e29518ca5e1..e374d1f3c5e77 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -115,7 +115,8 @@ class IterMapSimplifyBlockBinding : public StmtExprMutator { Array v = arith::IterMapSimplify(/*indices=*/op->iter_values, /*input_iters=*/loop_var2extent_, /*input_pred=*/op->predicate, - /*check_level=*/arith::IterMapLevel::Surjective); + /*check_level=*/arith::IterMapLevel::Surjective, + /*simplify_trivial_iterators=*/false); if (v.same_as(op->iter_values)) { return GetRef(op); } else { @@ -397,7 +398,7 @@ Array Split(ScheduleState self, const StmtSRef& loop_sref, for (int i = 0; i < n; i++) { const PrimExpr& factor = factors[i]; Var var = loop->loop_var.copy_with_suffix("_" + std::to_string(i)); - if (!is_one(factor)) substitute_value = substitute_value * factor + var; + substitute_value = substitute_value * factor + var; analyzer.Bind(var, Range::FromMinExtent(0, factor)); new_loop_vars.emplace_back(std::move(var)); } diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py index e4dff51cf9d4f..aa1d219d1c65a 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py +++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py @@ -86,7 +86,7 @@ def main(var_A: T.handle, var_B: T.handle, var_C: T.handle) -> None: with T.block("C"): i = T.axis.spatial(512, i0_1_i1_1_fused * 32 + i0_3 * 16 + i0_4) j = T.axis.spatial(512, i0_0_i1_0_fused * 32 + i0_2_i1_2_fused * 4 + i1_3 * 2 + i1_4) - k = T.axis.reduce(512, i2_1 * 32 + i2_2) + k = T.axis.reduce(512, i2_0 * 512 + i2_1 * 32 + i2_2) T.reads([A_shared[i, k], B_shared[k, j]]) T.writes([C_local[i, j]]) with T.init(): diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py index 8b36ec2f462da..aa7cb09265e9c 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring - from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply from tvm.meta_schedule.testing.schedule_rule import auto_bind from tvm.meta_schedule.testing.space_generation import check_trace diff --git a/tests/python/unittest/test_tir_schedule_reorder.py b/tests/python/unittest/test_tir_schedule_reorder.py index c5663a5f2ebd2..4351fe5b6361d 100644 --- a/tests/python/unittest/test_tir_schedule_reorder.py +++ b/tests/python/unittest/test_tir_schedule_reorder.py @@ -281,7 +281,7 @@ def cascade_pool_ops_tile_reordered( ) for h_i, w, kh, kw in T.grid(4, 108, 3, 3): with T.block("pool_1"): - ax0 = T.axis.spatial(1, 0) + ax0 = T.axis.spatial(1, n) ax1 = T.axis.spatial(16, c) ax2 = T.axis.spatial(108, h_o * 4 + h_i) ax3, rv0, rv1 = T.axis.remap("SRR", [w, kh, kw]) diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index d70748bc8a03d..c9e6eec029329 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -178,7 +178,7 @@ def elementwise_split_case0(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128, 128, 128]) for i1, i2, i3, j1, j2, k1, k2 in T.grid(2, 1, 64, 4, 32, 16, 8): with T.block("B"): - vi = T.axis.S(128, i1 * 64 + i3) + vi = T.axis.S(128, (i1 + i2) * 64 + i3) vj = T.axis.S(128, j1 * 32 + j2) vk = T.axis.S(128, k1 * 8 + k2) T.reads([A[vi, vj, vk]]) @@ -192,9 +192,9 @@ def elementwise_split_case1(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128, 128, 128]) for i1, i2, i3, j1, j2, j3, k1, k2, k3 in T.grid(2, 1, 64, 2, 1, 64, 2, 1, 64): with T.block("B"): - vi = T.axis.S(128, i1 * 64 + i3) - vj = T.axis.S(128, j1 * 64 + j3) - vk = T.axis.S(128, k1 * 64 + k3) + vi = T.axis.S(128, (i1 + i2) * 64 + i3) + vj = T.axis.S(128, (j1 + j2) * 64 + j3) + vk = T.axis.S(128, (k1 + k2) * 64 + k3) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -206,10 +206,10 @@ def elementwise_split_with_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128, 128]) for i0, i1, i2, j0, j1, k0, k1 in T.grid(1000, 2, 3, 1, 129, 3, 43): with T.block("B"): - T.where((i0 * 2 + i1) * 3 + i2 < 128 and j1 < 128 and k0 * 43 + k1 < 128) vi = T.axis.S(128, i0 * 6 + i1 * 3 + i2) - vj = T.axis.S(128, j1) + vj = T.axis.S(128, j0 * 129 + j1) vk = T.axis.S(128, k0 * 43 + k1) + T.where((i0 * 2 + i1) * 3 + i2 < 128 and j0 * 129 + j1 < 128 and k0 * 43 + k1 < 128) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 diff --git a/tests/python/unittest/test_tir_schedule_transform.py b/tests/python/unittest/test_tir_schedule_transform.py index 6dfd4315ec904..e812587e66761 100644 --- a/tests/python/unittest/test_tir_schedule_transform.py +++ b/tests/python/unittest/test_tir_schedule_transform.py @@ -15,11 +15,10 @@ # specific language governing permissions and limitations # under the License. 
import tvm -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN - -from tvm.tir import Schedule from tvm.script import tir as T +from tvm.tir import Schedule from tvm.tir.schedule.transform import tile_with_tensor_intrin +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN @tvm.script.ir_module @@ -128,11 +127,10 @@ def main( 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 ): with T.block("conv2d_NCHWc_int8"): - n = T.axis.spatial(1, 0) - oc_chunk, oh, ow, oc_block = T.axis.remap("SSSS", [i1, i2, i3, i4_1]) - kh = T.axis.reduce(1, 0) - kw = T.axis.reduce(1, 0) - ic_outer, ic_f_inner, ic_s_inner = T.axis.remap("RRR", [i7, i8, i9_1]) + n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) + oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) + kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) + ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) T.reads( placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], @@ -165,14 +163,10 @@ def test_tile_with_tensor_intrin_dense_vnni(): def test_tile_with_tensor_intrin_conv2d_nchwc_vnni(): s = Schedule(Conv2dNCHWcVNNIModule) block = s.get_block("conv2d_NCHWc_int8") - tiled_loop = tile_with_tensor_intrin(s, block, VNNI_DOT_16x4_INTRIN) - tiled_loops = s.get_loops(block) - assert len(tiled_loops) == 12 assert s.get(tiled_loop) == s.get(tiled_loops[-2]) - tvm.ir.assert_structural_equal(s.mod, Conv2dNCHWcVNNIModuleTiled) From 06c443e9959452c6da3a911fe0c11e08c5554477 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 5 Jun 2022 16:59:17 -0700 Subject: [PATCH 045/181] [Bugfix][TIR] compute-at/fuse/split dtype mismatch (#11582) The schedule primitives, including compute-at, fuse and split usually generate loop variables with `dtype=int32` as default. However, in some models, there are usecases where int64 are part of tensor shapes, which leads to unexpected behavior in scheduling. This PR brings the fix to existing known issues. 
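As a rough illustration, here is a minimal sketch of the kind of schedule that used to trigger the mismatch; it mirrors the new `test_split_int64_extent_with_mixed_factors` unit test added below, so the shapes and factors are only illustrative:

```python
# Minimal sketch: splitting a loop whose extent is int64 with factors that mix
# int64 and int32. Before this fix, the split produced loop variables with
# inconsistent dtypes, which broke later scheduling steps.
from tvm import te, tir

m = te.const(384, "int64")
A = te.placeholder((m,), name="A", dtype="float32")
B = te.compute((m,), lambda i: A[i] + 1, name="B")

sch = tir.Schedule(te.create_prim_func([A, B]), debug_mask="all")
(i,) = sch.get_loops(sch.get_block("B"))
sch.split(i, factors=[te.const(1, "int64"), te.const(512, "int32")])
```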
--- src/tir/schedule/primitive/compute_at.cc | 5 +- .../schedule/primitive/loop_transformation.cc | 21 +++++-- .../unittest/test_tir_schedule_compute_at.py | 24 ++++++-- .../unittest/test_tir_schedule_split_fuse.py | 61 ++++++++++++++++++- 4 files changed, 97 insertions(+), 14 deletions(-) diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 7f1d74ac20214..7b0d749f03dcf 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -194,7 +194,7 @@ struct BlockVarDomainInfo { } return; } - // simplify intsets + // simplify intset dom = to_simplified(dom); bound = to_simplified(bound); // if can proof the dom is within bound, remove bound @@ -242,7 +242,8 @@ class ScopeReconstructor : private StmtMutator { for (int i = 0; i < n_iters; ++i) { Range iter_dom = iter_doms[i].dom.CoverRange(block_->iter_vars[i]->dom); if (preserve_unit_loops || !is_one(iter_dom->extent)) { - Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(32)); + int bits = std::max(iter_dom->min.dtype().bits(), iter_dom->extent.dtype().bits()); + Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(bits)); loop_vars.push_back(var); loop_extents.push_back(analyzer->Simplify(iter_dom->extent)); iter_values.push_back(iter_dom->min + var); diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index e374d1f3c5e77..bb505bca33763 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -54,7 +54,7 @@ class SubstituteVarAndCollectOpaqueBlock : public StmtExprMutator { PrimExpr VisitExpr_(const VarNode* op) final { Var var = GetRef(op); if (Optional ret = vmap_(var)) { - return ret.value(); + return tvm::cast(var.dtype(), ret.value()); } else { return std::move(var); } @@ -391,15 +391,24 @@ Array Split(ScheduleState self, const StmtSRef& loop_sref, arith::Analyzer analyzer; CheckLoopStartsWithZero(self, loop_sref, &analyzer); + // Find the most common dtype + DataType dtype; + { + int bits = loop->loop_var.dtype().bits(); + for (const PrimExpr& factor : factors) { + bits = std::max(bits, factor.dtype().bits()); + } + dtype = DataType::Int(bits); + } int n = factors.size(); - PrimExpr substitute_value = 0; + PrimExpr substitute_value = make_const(dtype, 0); std::vector new_loop_vars; new_loop_vars.reserve(n); for (int i = 0; i < n; i++) { const PrimExpr& factor = factors[i]; - Var var = loop->loop_var.copy_with_suffix("_" + std::to_string(i)); + Var var = loop->loop_var.copy_with_suffix("_" + std::to_string(i)).copy_with_dtype(dtype); substitute_value = substitute_value * factor + var; - analyzer.Bind(var, Range::FromMinExtent(0, factor)); + analyzer.Bind(var, Range::FromMinExtent(make_const(dtype, 0), tvm::cast(dtype, factor))); new_loop_vars.emplace_back(std::move(var)); } Map opaque_block_reuse; @@ -481,11 +490,13 @@ StmtSRef Fuse(ScheduleState self, const Array& loop_srefs) { // Step 2. 
Create fused loop var and replace the original loop vars std::string suffix; int n = loops.size(); + int bits = loops[0]->loop_var.dtype().bits(); for (int i = 1; i < n; i++) { suffix += "_" + loops[i]->loop_var->name_hint; + bits = std::max(bits, loops[i]->loop_var.dtype().bits()); } suffix += "_fused"; - Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix); + Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix).copy_with_dtype(DataType::Int(bits)); Array substitute_value; substitute_value.resize(loops.size()); PrimExpr lower = 1; diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py index f477367adfad3..3772d9a4e0fec 100644 --- a/tests/python/unittest/test_tir_schedule_compute_at.py +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -15,13 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-function-docstring,missing-module-docstring -import sys - import pytest - import tvm import tvm.testing -from tvm import tir +from tvm import te, tir from tvm.script import tir as T from tvm.tir.schedule.testing import verify_trace_roundtrip @@ -1335,5 +1332,24 @@ def test_fail_all_producers_under_loop(): sch.reverse_compute_at(block, loop) +def test_compute_at_int64_loop(): + def _create_prim_func(): + n = te.var("n", dtype="int64") + m = te.var("m", dtype="int64") + A = te.placeholder((n, m), name="A", dtype="float32") + B = te.placeholder((n, m), name="B", dtype="float32") + C = te.compute((n, m), lambda i, j: A[i, j] + B[i, j], name="C") + D = te.compute((n, m), lambda i, j: C[i, j] + 1.0, name="D") + return te.create_prim_func([A, B, D]) + + mod = _create_prim_func() + sch = tir.Schedule(mod, debug_mask="all") + block_c = sch.get_block("C") + block_d = sch.get_block("D") + i, _ = sch.get_loops(block_d) + sch.compute_at(block_c, i) + verify_trace_roundtrip(sch=sch, mod=mod) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index c9e6eec029329..0bfac4e425b95 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -15,12 +15,10 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-function-docstring,missing-module-docstring -import sys - import pytest import tvm import tvm.testing -from tvm import tir +from tvm import te, tir from tvm.script import tir as T from tvm.tir.schedule.testing import verify_trace_roundtrip @@ -582,5 +580,62 @@ def zero_dim_added( tvm.ir.assert_structural_equal(zero_dim_added, sch.mod["main"]) +@pytest.mark.skip("Pending fix in affine analysis") +def test_fuse_int64(): + def _create_prim_func(): + n = te.const(16, "int32") + m = te.const(32, "int64") + A = te.placeholder((n, m), name="A", dtype="int32") + B = te.compute((n, m), lambda i, j: A[i, j] + 1, name="B") + return te.create_prim_func([A, B]) + + mod = _create_prim_func() + sch = tir.Schedule(mod, debug_mask="all") + i, j = sch.get_loops(sch.get_block("B")) + sch.fuse(i, j) + verify_trace_roundtrip(sch=sch, mod=mod) + + +def test_split_int64_extent_with_mixed_factors(): + def _create_prim_func(): + m = te.const(384, "int64") + A = te.placeholder((m,), name="A", dtype="float32") + B = te.compute((m,), lambda i: A[i] + 1, name="B") + return te.create_prim_func([A, B]) + + mod = _create_prim_func() + sch = tir.Schedule(mod, debug_mask="all") + (i,) = sch.get_loops(sch.get_block("B")) + sch.split( + i, + factors=[ + te.const(1, "int64"), + te.const(512, "int32"), + ], + ) + + +def test_split_int64_extent_with_int32_factors(): + def _create_prim_func(): + m = te.const(12, "int64") + A = te.placeholder((m,), name="A", dtype="float32") + B = te.compute((m,), lambda i: A[i] + 1, name="B") + return te.create_prim_func([A, B]) + + mod = _create_prim_func() + sch = tir.Schedule(mod, debug_mask="all") + (i,) = sch.get_loops(sch.get_block("B")) + sch.split( + i, + factors=[ + te.const(1, "int32"), + te.const(1, "int32"), + te.const(3, "int32"), + te.const(1, "int32"), + te.const(4, "int32"), + ], + ) + + if __name__ == "__main__": tvm.testing.main() From 8a568bc823fa7c8c3d37ff15deb4a8faef6d0bbb Mon Sep 17 00:00:00 2001 From: "Kathryn (Jinqi) Chen" <65606304+Kathryn-cat@users.noreply.github.com> Date: Sun, 5 Jun 2022 19:44:52 -0700 Subject: [PATCH 046/181] [MetaSchedule] exposed method: TuneContextNodeInitialize (#11576) I exposed the initialize() method for TuneContextNode on the C++ side and added a corresponding method to TuneContext class on the Python side, so that we do not need to call initialize_with_tune_context for every scheduling rule. 
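A hedged sketch of the intended usage (the workload `mod` and the exact constructor arguments are illustrative, not a full recipe):

```python
# Sketch only: initialize a TuneContext and all of its registered components
# with one call, instead of invoking initialize_with_tune_context by hand.
from tvm import meta_schedule as ms
from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
from tvm.target import Target

context = ms.TuneContext(
    mod=mod,                      # an IRModule to tune, assumed to be defined elsewhere
    target=Target("llvm"),
    space_generator=PostOrderApply(),
    task_name="example",
)
context.initialize()              # the newly exposed method
```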
--- python/tvm/meta_schedule/tune_context.py | 5 +++++ src/meta_schedule/tune_context.cc | 2 ++ 2 files changed, 7 insertions(+) diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py index ef2e4bcd8e6d9..19ab0a40cf617 100644 --- a/python/tvm/meta_schedule/tune_context.py +++ b/python/tvm/meta_schedule/tune_context.py @@ -129,3 +129,8 @@ def __init__( rand_state, num_threads, ) + + def initialize(self): + """Initialize the tuning context""" + + _ffi_api.TuneContextInitialize(self) # type: ignore # pylint: disable=no-member diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 382dd961dee0e..3607e3050803e 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -89,6 +89,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuneContext") }); TVM_REGISTER_GLOBAL("meta_schedule._SHash2Hex").set_body_typed(SHash2Hex); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextInitialize") + .set_body_method(&TuneContextNode::Initialize); } // namespace meta_schedule } // namespace tvm From 8038987411471bbdd03edba75271a1c00d571f23 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 5 Jun 2022 19:45:16 -0700 Subject: [PATCH 047/181] [MetaSchedule] Fix Summary Format for Invalid Runs (#11584) Previously for invalid tasks, MetaSchedule prints a huge number in latency which is aesthetically unacceptable. For example, ``` 69 | fused_cast_add_cast_3 | 16777216 | 2 | 0.0000 | 10000000000000000019156750857346687362159551272651920111528035145993793242039887559612361451081803235328.0000 | 20000000000000000038313501714693374724319102545303840223056070291987586484079775119224722902163606470656.0000 | 64 | ``` This PR fixes this behavior and turns the huge number into "N/A". --- .../task_scheduler/gradient_based.cc | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc index a95dbba6c3e14..f8cc9d5514941 100644 --- a/src/meta_schedule/task_scheduler/gradient_based.cc +++ b/src/meta_schedule/task_scheduler/gradient_based.cc @@ -79,10 +79,14 @@ class GradientBasedNode final : public TaskSchedulerNode { << /*name=*/record.task->task_name.value() // << /*flops=*/static_cast(record.flop) // << /*weight=*/static_cast(record.weight); - if (trials == 0) { + double latency = 1e9; + if (trials > 0) { + latency = record.best_time_cost_history.back(); + } + if (latency >= 1e9) { row << /*speed=*/"N/A" << /*latency=*/"N/A" << /*weighted_latency=*/"N/A"; } else { - double latency = record.best_time_cost_history.back() * 1000.0; + latency *= 1000.0; double speed = record.flop / latency / 1000.0; double weighted_latency = latency * record.weight; row << /*speed=*/speed << /*latency=*/latency << /*weighted_latency=*/weighted_latency; @@ -139,10 +143,15 @@ class GradientBasedNode final : public TaskSchedulerNode { int n = record.best_time_cost_history.size(); ICHECK_GE(n, 1); double best = record.best_time_cost_history[n - 1]; - double g1 = (n >= 1 + w) ? (record.best_time_cost_history[n - 1 - w] - best) / w : 0.0; - double g2 = best / n; - double g = alpha * g1 + (1 - alpha) * g2; - grad.push_back(g * record.weight); + if (best < 1e9) { + double g1 = (n >= 1 + w) ? 
(record.best_time_cost_history[n - 1 - w] - best) / w : 0.0; + double g2 = best / n; + double g = alpha * g1 + (1 - alpha) * g2; + grad.push_back(g * record.weight); + } else { + // If the best time cost is unavailable, it means some task is not valid. Skip it. + grad.push_back(-1e9); + } } auto max_grad = std::max_element(grad.begin(), grad.end()); auto min_grad = std::min_element(grad.begin(), grad.end()); From 283542f68a8759eebca97626b983909f55c64699 Mon Sep 17 00:00:00 2001 From: Hua Jiang Date: Sun, 5 Jun 2022 20:00:19 -0700 Subject: [PATCH 048/181] [CI][DOC] Fix incorrect commands in docs/readme.md (#11583) Fix incorrect commands in docs/readme.md --- docs/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/README.md b/docs/README.md index 520fea60ca28a..0ccb3cd3b954d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -79,14 +79,14 @@ the path that matches the regular expression pattern. For example, to only build tutorials under `/vta/tutorials`, run ```bash -python tests/scripts/ci.py docs --tutorials=/vta/tutorials +python tests/scripts/ci.py docs --tutorial-pattern=/vta/tutorials ``` To only build one specific file, do ```bash # The slash \ is used to get . in regular expression -python tests/scripts/ci.py docs --tutorials=file_name\.py +python tests/scripts/ci.py docs --tutorial-pattern=file_name\.py ``` ## Helper Scripts @@ -95,14 +95,14 @@ You can run the following script to reproduce the CI sphinx pre-check stage. This script skips the tutorial executions and is useful to quickly check the content. ```bash -python tests/scripts/ci.py docs --precheck +tests/scripts/task_python_docs.sh ``` The following script runs the full build which includes tutorial executions. You will need a GPU CI environment. 
```bash -python tests/scripts/ci.py --precheck --full +python tests/scripts/ci.py docs --full ``` ## Define the Order of Tutorials From bf4b8f5c766be8320df8d792a8c063b7b42c69f5 Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Sun, 5 Jun 2022 22:11:45 -0500 Subject: [PATCH 049/181] split test_forward_math_api function (#11537) --- .../frontend/paddlepaddle/test_forward.py | 237 ++++++++++++++---- 1 file changed, 193 insertions(+), 44 deletions(-) diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index 56ec3a4e5469e..8b696404e2b0c 100644 --- a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -1358,7 +1358,10 @@ def slice4(inputs): @tvm.testing.uses_gpu -def test_forward_math_api(): +def run_math_api(func): + api_name = func.__name__.split("_")[-1] + print("func_name:", api_name) + class MathAPI(nn.Layer): def __init__(self, api_name): super(MathAPI, self).__init__() @@ -1371,52 +1374,198 @@ def __init__(self, api_name): def forward(self, inputs): return self.func(inputs) - api_list = [ - "abs", - "acos", - "asin", - "atan", - "ceil", - "cos", - "cosh", - "elu", - "erf", - "exp", - "floor", - "hardshrink", - "hardtanh", - "log_sigmoid", - "log_softmax", - "log", - "log2", - "log10", - "log1p", - "reciprocal", - "relu", - "relu6", - "round", - "rsqrt", - "selu", - "sigmoid", - "sign", - "sin", - "sinh", - "softplus", - "softsign", - "sqrt", - "square", - "swish", - "tan", - "tanh", - ] input_shapes = [[128], [2, 100], [10, 2, 5], [7, 3, 4, 1]] for input_shape in input_shapes: input_data = paddle.rand(input_shape, dtype="float32") - for api_name in api_list: - if api_name in ["log", "log2", "log10", "reciprocal", "sqrt", "rsqrt"]: - # avoid illegal input, all elements should be positive - input_data = paddle.uniform(input_shape, min=0.01, max=0.99) - verify_model(MathAPI(api_name), input_data=input_data) + if api_name in ["log", "log2", "log10", "reciprocal", "sqrt", "rsqrt"]: + # avoid illegal input, all elements should be positive + input_data = paddle.uniform(input_shape, min=0.01, max=0.99) + verify_model(MathAPI(api_name), input_data=input_data) + + +@run_math_api +def test_forward_abs(): + pass + + +@run_math_api +def test_forward_acos(): + pass + + +@run_math_api +def test_forward_abs(): + pass + + +@run_math_api +def test_forward_atan(): + pass + + +@run_math_api +def test_forward_ceil(): + pass + + +@run_math_api +def test_forward_cos(): + pass + + +@run_math_api +def test_forward_cosh(): + pass + + +@run_math_api +def test_forward_elu(): + pass + + +@run_math_api +def test_forward_erf(): + pass + + +@run_math_api +def test_forward_exp(): + pass + + +@run_math_api +def test_forward_floor(): + pass + + +@run_math_api +def test_forward_hardshrink(): + pass + + +@run_math_api +def test_forward_hardtanh(): + pass + + +@run_math_api +def test_forward_log_sigmoid(): + pass + + +@run_math_api +def test_forward_log_softmax(): + pass + + +@run_math_api +def test_forward_log(): + pass + + +@run_math_api +def test_forward_log2(): + pass + + +@run_math_api +def test_forward_log10(): + pass + + +@run_math_api +def test_forward_log1p(): + pass + + +@run_math_api +def test_forward_reciprocal(): + pass + + +@run_math_api +def test_forward_relu(): + pass + + +@run_math_api +def test_forward_round(): + pass + + +@run_math_api +def test_forward_rsqrt(): + pass + + +@run_math_api +def test_forward_selu(): + pass + + +@run_math_api +def test_forward_sigmoid(): + 
pass + + +@run_math_api +def test_forward_sign(): + pass + + +@run_math_api +def test_forward_sin(): + pass + + +@run_math_api +def test_forward_softplus(): + pass + + +@run_math_api +def test_forward_sqrt(): + pass + + +@run_math_api +def test_forward_square(): + pass + + +@run_math_api +def test_forward_sin(): + pass + + +@run_math_api +def test_forward_softsign(): + pass + + +@run_math_api +def test_forward_sqrt(): + pass + + +@run_math_api +def test_forward_square(): + pass + + +@run_math_api +def test_forward_swish(): + pass + + +@run_math_api +def test_forward_tan(): + pass + + +@run_math_api +def test_forward_tanh(): + pass @tvm.testing.uses_gpu From b555bf5481d3eb261427850cea286c162aa3d2e3 Mon Sep 17 00:00:00 2001 From: M Date: Mon, 6 Jun 2022 16:54:09 +0800 Subject: [PATCH 050/181] fix bmm quantization realize (#11586) --- src/relay/quantize/realize.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 301dc1a09f396..5766c62eaa433 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -511,13 +511,14 @@ Expr BatchMatmulRealize(const Call& ref_call, const Array& new_args, const Expr ldata = lhs->data; Expr rdata = rhs->data; - DataType dtype = cfg->dtype_input; + DataType dtype_input = cfg->dtype_input; + DataType dtype_weight = cfg->dtype_weight; - if (lhs->dtype != dtype) { - ldata = Cast(ldata, dtype); + if (lhs->dtype != dtype_input) { + ldata = Cast(ldata, dtype_input); } - if (rhs->dtype != dtype) { - rdata = Cast(rdata, dtype); + if (rhs->dtype != dtype_weight) { + rdata = Cast(rdata, dtype_weight); } const auto ref_attrs = ref_call->attrs.as(); From 609d6af17605d657909549e908876f4335206bd6 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Mon, 6 Jun 2022 13:29:07 +0100 Subject: [PATCH 051/181] [microNPU] Fix output mismatch in Leaky ReLU (#11397) * [microNPU] Fix output mismatch in Leaky ReLU All codegen tests have been running with a representative dataset between 0,1 which masked an output mismatch in Leaky ReLU when compared to TFLite kernels. This issue can be replicated by replacing the representative dataset range with something like -1,1. To fix this mismatch, we use the same implementation for calculating LUT values as Vela which uses arithmetic constrained to quantized values, rather than the previously used floating point calculations. Change-Id: I0ed52215acd27722873be609271971b6fc4aaef1 * fix lint Change-Id: Ica7de0c000ee015e79fe10985b2ec7a9b341861f * fix lint again Change-Id: I005d90ad248bfff7090f99d161eefbdc962cba48 --- .../relay/backend/contrib/ethosu/legalize.py | 88 ++++++++++++------- .../contrib/test_ethosu/test_codegen.py | 6 +- 2 files changed, 62 insertions(+), 32 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py index d83cd403ca144..c940abdeab5f5 100644 --- a/python/tvm/relay/backend/contrib/ethosu/legalize.py +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -16,10 +16,11 @@ # under the License. 
# pylint: disable=invalid-name, unused-argument, import-outside-toplevel, no-value-for-parameter """A set of passes to legalize some of operations for the NPU""" -from typing import List, Type, Callable, Any, Dict +from typing import List, Type, Callable import math import numpy as np # type: ignore +from ethosu.vela import scaling, fp_math import tvm # type: ignore from tvm import relay @@ -132,7 +133,6 @@ def get_lut_from_func( ofm_scale: float, ofm_zp: int, func: Callable[[float], float], - func_params: Dict[str, Any], ) -> List[int]: """Calculates the values of the lookup table based on the calculation function""" @@ -142,7 +142,7 @@ def get_lut_from_func( qmin, qmax = np.iinfo(dtype).min, np.iinfo(dtype).max for x in range(qmin, qmax + 1): x_real = ifm_scale * (x - ifm_zp) - out_real = func(x_real, **func_params) + out_real = func(x_real) lut_result = int(util.round_away_zero(ofm_zp + out_real / ofm_scale)) lut_result = min(qmax, max(qmin, lut_result)) lut_values.append(lut_result) @@ -165,29 +165,10 @@ def __init__( self.activation_type = activation_type self.calc_func = calc_func - def get_calc_func_params(self, expr: tvm.relay.Expr) -> Dict[str, Any]: - """ - Overridable method that can be used to extract additional arguments - for passing to calc_func. - - Parameters - ---------- - expr : tvm.relay.Expr - The matched composite activation function. - - Returns - ------- - Dict[str, Any] - Maps argument name to argument value. - """ - return {} - def callback(self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map): params = self.params_class(post.op.body) params.ifm.tensor = post.args[0] - calc_func_params = self.get_calc_func_params(post.op) - input_scale = float(params.ifm.q_params.scale_f32) input_zp = int(params.ifm.q_params.zero_point) output_scale = float(params.ofm.q_params.scale_f32) @@ -199,7 +180,6 @@ def callback(self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.c output_scale, output_zp, self.calc_func, - calc_func_params, ) lut = relay.const(lut_values, dtype=params.ifm.dtype) @@ -257,19 +237,65 @@ def leaky_relu_calc_func(x: float, alpha: float) -> float: return x if x >= 0 else x * alpha -class LeakyReLURewriter(LutActivationRewriter): +class LeakyReLURewriter(DFPatternCallback): """This pass adds leaky relu as a LUT for identity op.""" def __init__(self): - super().__init__( - params_class=ethosu_patterns.LeakyReLUParams, - activation_type="LUT", - calc_func=leaky_relu_calc_func, + super().__init__(require_type=True, rewrite_once=True) + self.params_class = ethosu_patterns.LeakyReLUParams + self.pattern = wildcard().has_attr({"Composite": self.params_class.composite_name})( + wildcard() ) - def get_calc_func_params(self, expr: tvm.relay.Expr) -> Dict[str, Any]: - params = ethosu_patterns.LeakyReLUParams(expr.body) - return {"alpha": params.alpha} + def callback(self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map): + params = self.params_class(post.op.body) + params.ifm.tensor = post.args[0] + + input_scale = np.double(float(params.ifm.q_params.scale_f32)) + input_zp = int(params.ifm.q_params.zero_point) + output_scale = np.double(float(params.ofm.q_params.scale_f32)) + output_zp = int(params.ofm.q_params.zero_point) + + alpha = params.alpha + + # The calculation of the LUT values is similar to that in Vela + # convert_lrelu_to_lut(op, arch) + # (https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/tags/3.2.0/ethosu/vela/tflite_graph_optimiser.py#864) # pylint: 
disable=line-too-long + alpha_scalar = 1 + alpha_scale, alpha_shift = scaling.elementwise_mul_scale(input_scale, alpha, output_scale) + identity_scale, identity_shift = scaling.elementwise_mul_scale(input_scale, 1, output_scale) + + dtype = params.ifm.dtype + qmin, qmax = np.iinfo(dtype).min, np.iinfo(dtype).max + + def calculate_lut_value(i): + zp_shift = ( + fp_math.multiply_by_quantized_multiplier( + alpha_scalar * (i - input_zp), alpha_scale, alpha_shift + ) + if i < input_zp + else fp_math.multiply_by_quantized_multiplier( + i - input_zp, identity_scale, identity_shift + ) + ) + + return min(qmax, max(qmin, output_zp + zp_shift)) + + values = list(map(calculate_lut_value, range(qmin, qmax + 1))) + lut = relay.const(values, dtype=dtype) + + # We baked the requantization into the LUT, so we don't requantize the identity operator + identity = ethosu_ops.ethosu_identity( + ifm=params.ifm.tensor, + lut=lut, + ifm_scale=input_scale, + ifm_zero_point=input_zp, + ofm_scale=input_scale, + ofm_zero_point=input_zp, + activation="LUT", + ) + + return identity class Conv2DRewriter(DFPatternCallback): diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index b6b78c3357605..b73ebd5361192 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -1022,7 +1022,11 @@ def leaky_relu_func(x): return tf.nn.leaky_relu(x, alpha=alpha) infra.compare_tvm_with_tflite( - leaky_relu_func, [ifm_shape], accel_type, enable_cascader=is_u55_accel_type(accel_type) + leaky_relu_func, + [ifm_shape], + accel_type, + enable_cascader=is_u55_accel_type(accel_type), + ranges=[(-1, 1)], ) From 1aac4d6826192383a755369ab5ccfe4876e8902b Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Mon, 6 Jun 2022 15:10:22 +0100 Subject: [PATCH 052/181] [microNPU] Optimize separate padding operation for conv2d (#11468) Optimizes a case where padding appears as a separate nn.pad operation followed by a qnn.conv2d. If possible, the nn.pad will be partitioned and offloaded together with the qnn.conv2d operation, as opposed to separately. As a fallback, both operations will be considered separately. 
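For illustration, the targeted graphs look roughly like the new TFLite tests added below (the shapes, strides and padding here are only illustrative):

```python
# Sketch only: explicit padding expressed as a separate pad op, followed by a
# VALID convolution. After this change the pair can be offloaded to the NPU as
# a single convolution with a combined padding attribute.
import numpy as np
import tensorflow as tf

@tf.function
def conv2d(x):
    x = tf.pad(x, [[0, 0], [0, 1], [0, 1], [0, 0]], "CONSTANT")  # becomes nn.pad in Relay
    weight = tf.constant(np.random.uniform(size=(3, 2, 3, 3)), dtype=tf.float32)
    return tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding="VALID", dilations=(2, 1))
```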
cc Mousius NicolaLancellotti ekalda manupa-arm --- python/tvm/relay/op/contrib/ethosu.py | 66 +++++- tests/python/contrib/test_ethosu/infra.py | 11 +- .../contrib/test_ethosu/test_codegen.py | 68 +++++- .../contrib/test_ethosu/test_legalize.py | 216 ++++++++++++++++++ 4 files changed, 349 insertions(+), 12 deletions(-) diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py index dfdc0c82fb1e9..806bf6dce2e89 100644 --- a/python/tvm/relay/op/contrib/ethosu.py +++ b/python/tvm/relay/op/contrib/ethosu.py @@ -201,6 +201,8 @@ def __init__(self, func_body: tvm.relay.Function): from tvm.relay.backend.contrib.ethosu.util import RequantArgs activation = None + separate_padding = None + if str(func_body.op) in self.activation_map.keys(): activation = func_body requantize_op = activation.args[0] @@ -208,8 +210,11 @@ def __init__(self, func_body: tvm.relay.Function): requantize_op = func_body bias_add = requantize_op.args[0] qnn_conv2d = bias_add.args[0] + if isinstance(qnn_conv2d.args[0], relay.Call) and str(qnn_conv2d.args[0].op) == "nn.pad": + separate_padding = qnn_conv2d.args[0] data_layout = qnn_conv2d.attrs.data_layout self.kernel_layout = qnn_conv2d.attrs.kernel_layout + # We consider the weights & biases as params as it should be a Constant self.weights = TensorParams( qnn_conv2d.args[QConv2DArgs.WEIGHTS.value], @@ -224,8 +229,11 @@ def __init__(self, func_body: tvm.relay.Function): requantize_op.args[RequantArgs.IFM_SCALE.value], requantize_op.args[RequantArgs.IFM_ZERO_POINT.value], ) + ifm_tensor = ( + separate_padding.args[0] if separate_padding else qnn_conv2d.args[QConv2DArgs.IFM.value] + ) self.ifm = TensorParams( - qnn_conv2d.args[QConv2DArgs.IFM.value], + ifm_tensor, data_layout, qnn_conv2d.args[QConv2DArgs.IFM_SCALE.value], qnn_conv2d.args[QConv2DArgs.IFM_ZERO_POINT.value], @@ -237,7 +245,10 @@ def __init__(self, func_body: tvm.relay.Function): requantize_op.args[RequantArgs.OFM_ZERO_POINT.value], ) attrs = qnn_conv2d.attrs - self.padding = attrs.padding + + pad_value = int(qnn_conv2d.args[QConv2DArgs.IFM_ZERO_POINT.value].data.asnumpy()) + self.padding = self.extract_padding(attrs.padding, separate_padding, pad_value) + self.strides = attrs.strides self.dilation = attrs.dilation self.activation = activation @@ -250,6 +261,37 @@ def __init__(self, func_body: tvm.relay.Function): if self.groups == self.weights.shape[channels_axis[self.kernel_layout]]: self.is_depthwise = True + @staticmethod + def extract_padding( + operator_padding: Tuple[int, int, int, int], + separate_padding: relay.Call, + pad_value: int, + ) -> Optional[Tuple[int, int, int, int]]: + """ + Convolution operations can sometimes have padding represented as a separate + padding operation before the convolution operation itself. Here we can check + whether these representations can be combined into a single padding attribute + as part of the NPU convolution itself. If the padding specified by the separate + nn.pad operation is not supported, None will be returned. This will cause the + nn.pad to be offloaded separately. 
+ """ + if separate_padding is None: + return operator_padding + if pad_value != int(separate_padding.args[1].data.asnumpy()): + return None + pad_width = separate_padding.attrs["pad_width"] + if len(pad_width) != 4: + return None + if list(pad_width[0]) != [0, 0] or list(pad_width[3]) != [0, 0]: + return None + top, left, bottom, right = operator_padding + return [ + top + pad_width[1][0], + left + pad_width[2][0], + bottom + pad_width[1][1], + right + pad_width[2][1], + ] + def is_valid(self) -> bool: """ This function checks whether QnnConv2D has compatible attributes with the NPU @@ -267,7 +309,7 @@ def is_valid(self) -> bool: return False if not check_dilation(self.dilation): return False - if not check_padding(self.padding, self.padding_bounds): + if not self.padding or not check_padding(self.padding, self.padding_bounds): return False legal_groups = [1, self.ofm.shape[3]] if self.groups not in legal_groups: @@ -437,7 +479,7 @@ def is_valid(self): return False if not check_dilation(self.dilation): return False - if not check_padding(self.padding, self.padding_bounds): + if not self.padding or not check_padding(self.padding, self.padding_bounds): return False if self.weights.layout != "HWOI": return False @@ -453,8 +495,14 @@ def qnn_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: """ This function creates the pattern for qnn.conv2D with optional fused RELU activation. """ + optional_pad = is_op("nn.pad")(wildcard(), is_constant()) qnn_conv2d = is_op("qnn.conv2d")( - wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant() + optional_pad | wildcard(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), ).has_attr({"kernel_layout": "HWIO"}) bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant()) req = is_op("qnn.requantize")( @@ -468,8 +516,14 @@ def qnn_depthwise_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: """ This function creates the pattern for depthwise qnn.conv2D with optional fused RELU activation. 
""" + optional_pad = is_op("nn.pad")(wildcard(), is_constant()) qnn_conv2d = is_op("qnn.conv2d")( - wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant() + optional_pad | wildcard(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), ).has_attr({"kernel_layout": "HWOI"}) bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant()) req = is_op("qnn.requantize")( diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index a1bdcb47e62d1..1f999781e3b1b 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -473,10 +473,17 @@ def compute_ofm_shape(ifm_shape, padding, kernel_shape, strides, dilation=[1, 1] assert len(strides) == 2 assert len(dilation) == 2 assert len(kernel_shape) == 2 - if padding.lower() == "valid": + if isinstance(padding, tuple): + h = ( + ifm_shape[1] - (kernel_shape[0] - 1) * dilation[0] + padding[0] + padding[2] + ) // strides[0] + w = ( + ifm_shape[2] - (kernel_shape[1] - 1) * dilation[1] + padding[1] + padding[3] + ) // strides[1] + elif padding.lower() == "valid": h = math.ceil((ifm_shape[1] - (kernel_shape[0] - 1) * dilation[0]) / strides[0]) w = math.ceil((ifm_shape[2] - (kernel_shape[1] - 1) * dilation[1]) / strides[1]) - if padding.lower() == "same": + elif padding.lower() == "same": h = math.ceil(ifm_shape[1] / strides[0]) w = math.ceil(ifm_shape[2] / strides[1]) ofm_shape = [ifm_shape[0], h, w, ifm_shape[3]] diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index b73ebd5361192..2d3489889e8ab 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -72,13 +72,43 @@ def conv2d(x): padding=padding, dilations=dilation, ) - if activation: + if activation == "RELU": op = tf.nn.relu(op) return op infra.compare_tvm_with_tflite(conv2d, [ifm_shape], accel_type) +def test_tflite_conv2d_with_separate_pad(): + np.random.seed(0) + + ifm_shape = (1, 55, 34, 3) + kernel_shape = (3, 2) + strides = (1, 1) + dilation = (2, 1) + padding = (0, 0, 1, 1) + + @tf.function + def conv2d(x): + tf_strides = [1, strides[0], strides[1], 1] + op = tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + ) + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 3] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + return tf.nn.conv2d( + op, + weight, + strides=tf_strides, + padding="VALID", + dilations=dilation, + ) + + infra.compare_tvm_with_tflite(conv2d, [ifm_shape], "ethos-u55-256") + + @pytest.mark.parametrize("ifm_shape", [(1, 214, 227, 2), (1, 27, 42, 3)]) @pytest.mark.parametrize("kernel_shape", [(3, 2), (1, 3)]) @pytest.mark.parametrize("strides, dilation", [((1, 1), (2, 1)), ((3, 2), (1, 1))]) @@ -120,7 +150,7 @@ def conv2d_double(x): padding=padding, dilations=dilation, ) - if activation: + if activation == "RELU": op2 = tf.nn.relu(op2) return op2 @@ -156,7 +186,7 @@ def conv_invalid_scale(x): padding=padding, dilations=dilation, ) - if activation: + if activation == "RELU": op = tf.nn.relu(op) return op @@ -191,13 +221,43 @@ def depthwise_conv2d(x): op = tf.nn.depthwise_conv2d( x, weight, strides=tf_strides, padding=padding, dilations=dilation ) - if activation_function: + if activation_function == "RELU": op = tf.nn.relu(op) return op infra.compare_tvm_with_tflite(depthwise_conv2d, [ifm_shape], accel_type) +def 
test_tflite_depthwise_conv2d_with_separate_pad(): + np.random.seed(0) + + ifm_shape = (1, 23, 32, 7) + kernel_shape = (1, 2) + strides = (3, 2) + dilation = (1, 1) + padding = (0, 0, 1, 1) + + @tf.function + def depthwise_conv2d(x): + tf_strides = [1, strides[0], strides[1], 1] + op = tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + ) + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 1] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + return tf.nn.depthwise_conv2d( + op, + weight, + strides=tf_strides, + padding="VALID", + dilations=dilation, + ) + + infra.compare_tvm_with_tflite(depthwise_conv2d, [ifm_shape], "ethos-u55-256") + + @pytest.mark.parametrize( "accel_type", ACCEL_TYPES, diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py index 2dd5eff91373b..3f8b5f7d5b583 100644 --- a/tests/python/contrib/test_ethosu/test_legalize.py +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -347,6 +347,114 @@ def verify(ext_func): verify(mod["tvmgen_default_ethos_u_main_0"]) +def test_tflite_conv2d_with_separate_padding_legalize(): + dtype = "int8" + ifm_shape = (1, 55, 34, 3) + kernel_shape = (3, 2) + strides = (1, 1) + dilation = (2, 1) + padding = (0, 0, 1, 1) + + def create_tflite_graph_single(): + class Model(tf.Module): + @tf.function + def tf_function(self, x): + tf_strides = [1, strides[0], strides[1], 1] + op = tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + ) + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 3] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + return tf.nn.conv2d( + op, + weight, + strides=tf_strides, + padding="VALID", + dilations=dilation, + ) + + model = Model() + concrete_func = model.tf_function.get_concrete_function( + tf.TensorSpec(ifm_shape, dtype=tf.float32) + ) + # Convert the model + def representative_dataset(): + for _ in range(100): + data = np.random.rand(*tuple(ifm_shape)) + yield [data.astype(np.float32)] + + converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return tflite_model + + def verify(ext_func): + op = ext_func.body + ofm_channels = op.attrs.ofm_channels + + # check IFM + ifm = op.args[0].checked_type + assert list(ifm.shape) == list(ifm_shape) + assert str(ifm.dtype) == dtype + assert ifm.shape[3] == ofm_channels + + # check OFM + ofm = op.checked_type + expected_ofm_shape = infra.compute_ofm_shape( + ifm_shape, padding, kernel_shape, strides, dilation + ) + assert list(ofm.shape) == list(expected_ofm_shape) + assert str(ofm.dtype) == dtype + assert ofm.shape[3] == ofm_channels + + # check weights + weights_ohwi = op.args[1].data.asnumpy() + assert str(weights_ohwi.dtype) == dtype + assert weights_ohwi.shape[0] == ofm_channels + assert weights_ohwi.shape[1] == kernel_shape[0] + assert weights_ohwi.shape[2] == kernel_shape[1] + assert weights_ohwi.shape[3] == 3 + + # Check that scale_bias matches weight tensor + assert list(op.args[2].checked_type.shape)[0] == ofm_channels + + assert list(op.attrs.padding) == list(padding) + assert 
list(op.attrs.strides) == list(strides) + assert list(op.attrs.dilation) == list(dilation) + + conv2d_pattern_table = [ + ( + ethosu.QnnConv2DParams.composite_name, + ethosu.qnn_conv2d_pattern(), + lambda pat: ethosu.QnnConv2DParams(pat).is_valid(), + ) + ] + + tflite_graph = create_tflite_graph_single() + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + + mod, conv_params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": ifm_shape}, + dtype_dict={"input": dtype}, + ) + + mod["main"] = bind_params_by_name(mod["main"], conv_params) + mod = partition_ethosu_by_table(mod, conv2d_pattern_table) + + mod["tvmgen_default_ethos_u_main_0"] = dataflow_pattern.rewrite( + legalize.Conv2DRewriter(), mod["tvmgen_default_ethos_u_main_0"] + ) + + verify(mod["tvmgen_default_ethos_u_main_0"]) + + @pytest.mark.parametrize("ifm_shape", [(1, 299, 299, 3), (1, 123, 17, 7)]) @pytest.mark.parametrize("kernel_shape", [(7, 3), (22, 5)]) @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @@ -458,6 +566,114 @@ def verify(ext_func): verify(mod["tvmgen_default_ethos_u_main_0"]) +def test_tflite_depthwise_conv2d_with_separate_padding_legalize(): + dtype = "int8" + ifm_shape = (1, 23, 32, 7) + kernel_shape = (1, 2) + strides = (3, 2) + dilation = (1, 1) + padding = (0, 0, 1, 1) + + def create_tflite_graph(): + class Model(tf.Module): + @tf.function + def tf_function(self, x): + tf_strides = [1, strides[0], strides[1], 1] + op = tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + ) + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 1] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + return tf.nn.depthwise_conv2d( + op, + weight, + strides=tf_strides, + padding="VALID", + dilations=dilation, + ) + + model = Model() + concrete_func = model.tf_function.get_concrete_function( + tf.TensorSpec(ifm_shape, dtype=tf.float32) + ) + # Convert the model + def representative_dataset(): + for _ in range(100): + data = np.random.rand(*tuple(ifm_shape)) + yield [data.astype(np.float32)] + + converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return tflite_model + + def verify(ext_func): + op = ext_func.body + ofm_channels = op.attrs.ofm_channels + + # check IFM + ifm = op.args[0].checked_type + assert list(ifm.shape) == list(ifm_shape) + assert str(ifm.dtype) == dtype + assert ifm.shape[3] == ofm_channels + + # check OFM + ofm = op.checked_type + expected_ofm_shape = infra.compute_ofm_shape( + ifm_shape, padding, kernel_shape, strides, dilation + ) + assert list(ofm.shape) == list(expected_ofm_shape) + assert str(ofm.dtype) == dtype + assert ofm.shape[3] == ofm_channels + + # check weights + weights_ohwi = op.args[1].data.asnumpy() + assert str(weights_ohwi.dtype) == dtype + assert weights_ohwi.shape[0] == ofm_channels + assert weights_ohwi.shape[1] == kernel_shape[0] + assert weights_ohwi.shape[2] == kernel_shape[1] + assert weights_ohwi.shape[3] == 1 # only depth multiplier 1 is supported + + # Check that scale_bias matches weight tensor + assert list(op.args[2].checked_type.shape)[0] == ofm_channels + + assert list(op.attrs.padding) == list(padding) + assert 
op.attrs.ofm_channels == ofm_channels + assert list(op.attrs.strides) == list(strides) + assert list(op.attrs.dilation) == list(dilation) + + depthwise_pattern_table = [ + ( + ethosu.QnnDepthwiseConv2DParams.composite_name, + ethosu.qnn_depthwise_conv2d_pattern(), + lambda pat: ethosu.QnnDepthwiseConv2DParams(pat).is_valid(), + ) + ] + + tflite_graph = create_tflite_graph() + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + + mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": ifm_shape}, + dtype_dict={"input": dtype}, + ) + + mod["main"] = bind_params_by_name(mod["main"], params) + mod = partition_ethosu_by_table(mod, depthwise_pattern_table) + + mod["tvmgen_default_ethos_u_main_0"] = dataflow_pattern.rewrite( + legalize.DepthwiseConv2DRewriter(), mod["tvmgen_default_ethos_u_main_0"] + ) + verify(mod["tvmgen_default_ethos_u_main_0"]) + + @pytest.mark.parametrize("pooling_type", ["MAX", "AVG"]) @pytest.mark.parametrize("ifm_shape", [[1, 3, 4, 3], [1, 4, 5, 2]]) @pytest.mark.parametrize( From 9d6599c928ec4de1aede59927fcc5f651096e358 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 6 Jun 2022 08:49:22 -0700 Subject: [PATCH 053/181] [PROFILER] Add configuration information to profiler (#11530) Configuration is a place to store extra information related to the specific profiler run. Right now it is just the executor used and the number of threads. The roofline analysis also adds peak flops and peak bandwidth. --- include/tvm/runtime/profiling.h | 17 ++- python/tvm/runtime/profiling/__init__.py | 11 +- python/tvm/utils/roofline.py | 5 +- src/node/structural_hash.cc | 1 + .../debug/graph_executor_debug.cc | 2 +- src/runtime/profiling.cc | 111 ++++++++++++------ src/runtime/vm/profiler/vm.cc | 6 +- .../python/unittest/test_runtime_profiling.py | 3 + 8 files changed, 109 insertions(+), 47 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 0163f0c2e49e1..83c26933be45b 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_PROFILING_H_ #include +#include #include #include #include @@ -192,6 +193,11 @@ class ReportNode : public Object { * because these metrics include the overhead of the executor. */ Map> device_metrics; + /*! Configuration used for this profiling run. Includes number of threads, executor. + * + * Values must be an object type that can be used with device_metrics. + */ + Map configuration; /*! \brief Output `calls` in CSV format. * * Note that this does not include `device_metrics`, it only includes per-call metrics. @@ -255,9 +261,11 @@ class Report : public ObjectRef { /*! Construct a Report from a set of calls (with associated metrics) and per-device metrics. * \param calls Function calls and associated metrics. * \param device_metrics Per-device metrics for overall execution. + * \param configuration Configuration data specific to this profiling run. */ explicit Report(Array> calls, - Map> device_metrics); + Map> device_metrics, + Map configuration); /*! Deserialize a Report from a JSON object. Needed for sending the report over RPC. * \param json Serialized json report from `ReportNode::AsJSON`. @@ -366,8 +374,10 @@ class Profiler { * \param devs The list of devices the profiler will be running on. Should * include all devices used by profiled operators. * \param metric_collectors Additional `MetricCollector`s to use with this profiler. 
+ * \param configuration Additional configuration data to add to the outputted profiling report. */ - explicit Profiler(std::vector devs, std::vector metric_collectors); + explicit Profiler(std::vector devs, std::vector metric_collectors, + std::unordered_map configuration = {}); /*! \brief Start the profiler. * * This function should only be called once per object. @@ -400,7 +410,7 @@ class Profiler { * \returns A `Report` that can either be formatted as CSV (with `.AsCSV`) * or as a human readable table (with `.AsTable`). */ - profiling::Report Report(bool aggregate = true, bool sort = true); + profiling::Report Report(); /*! \brief Check if the profiler is currently running. * \returns Whether or not the profiler is running. */ @@ -412,6 +422,7 @@ class Profiler { std::vector calls_; std::stack in_flight_; std::vector collectors_; + std::unordered_map configuration_; }; /* \brief A duration in time. */ diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py index 5737790378278..347d8b9f94f15 100644 --- a/python/tvm/runtime/profiling/__init__.py +++ b/python/tvm/runtime/profiling/__init__.py @@ -36,7 +36,10 @@ class Report(Object): """ def __init__( - self, calls: Sequence[Dict[str, Object]], device_metrics: Dict[str, Dict[str, Object]] + self, + calls: Sequence[Dict[str, Object]], + device_metrics: Dict[str, Dict[str, Object]], + configuration: Dict[str, Object], ): """Construct a profiling report from a list of metrics and per-device metrics. @@ -47,8 +50,12 @@ def __init__( device_metrics : Dict[str, Dict[str, Object]] Per device metrics. + + configuration : Dict[str, Object] + Configuration of TVM for this profiling run. Includes number of + threads, executor. """ - self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics) + self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics, configuration) def csv(self): """Convert this profiling report into CSV format. 
diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py index 8a17b9f003123..6cfca81c5c420 100644 --- a/python/tvm/utils/roofline.py +++ b/python/tvm/utils/roofline.py @@ -400,7 +400,10 @@ def roofline_from_existing( new_calls.append(call) else: new_calls.append(call) - return profiling.Report(new_calls, report.device_metrics) + new_configuration = dict(report.configuration.items()) + new_configuration["Estimated Peak FMA FLOP/s"] = profiling.Ratio(peak_flops) + new_configuration["Estimated Peak Bandwidth (byte/second)"] = profiling.Ratio(peak_bandwidth) + return profiling.Report(new_calls, report.device_metrics, new_configuration) def roofline_analysis( diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index e97e5f41bfc28..23811e2190784 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -521,6 +521,7 @@ struct ReportNodeTrait { static void VisitAttrs(runtime::profiling::ReportNode* report, AttrVisitor* attrs) { attrs->Visit("calls", &report->calls); attrs->Visit("device_metrics", &report->device_metrics); + attrs->Visit("configuration", &report->configuration); } static constexpr std::nullptr_t SEqualReduce = nullptr; static constexpr std::nullptr_t SHashReduce = nullptr; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index bd3b0db0403f3..4a950153954ff 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -294,7 +294,7 @@ class GraphExecutorDebug : public GraphExecutor { */ profiling::Report Profile(Array collectors) { std::vector cs(collectors.begin(), collectors.end()); - profiling::Profiler prof(devices_, cs); + profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}}); // warm up. 1 iteration does not seem enough. for (int i = 0; i < 3; i++) { diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9499a6e7a5bbb..9f95bf18f74b2 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -105,8 +105,9 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); namespace profiling { -Profiler::Profiler(std::vector devs, std::vector metric_collectors) - : devs_(devs), collectors_(metric_collectors) { +Profiler::Profiler(std::vector devs, std::vector metric_collectors, + std::unordered_map configuration) + : devs_(devs), collectors_(metric_collectors), configuration_(configuration) { is_running_ = false; std::vector wrapped_devs; for (auto dev : devs) { @@ -117,6 +118,9 @@ Profiler::Profiler(std::vector devs, std::vector metric } // reset the thread pool so that PAPI eventset hooks are set in all threads. threading::ResetThreadPool(); + + configuration_[String("Number of threads")] = + ObjectRef(make_object(threading::NumThreads())); } void Profiler::Start() { @@ -279,7 +283,7 @@ String ReportNode::AsCSV() const { } namespace { -void print_metric(std::ostream& os, ObjectRef o) { +void metric_as_json(std::ostream& os, ObjectRef o) { if (o.as()) { os << "{\"string\":" << "\"" << Downcast(o) << "\"" @@ -309,13 +313,14 @@ String ReportNode::AsJSON() const { // value we want to print. Instead we construct the json by hand because it // is easier. 
s << "{"; + s << "\"calls\":["; for (size_t i = 0; i < calls.size(); i++) { size_t j = 0; s << "{"; for (const auto& kv : calls[i]) { s << "\"" << kv.first << "\":"; - print_metric(s, kv.second); + metric_as_json(s, kv.second); if (j < calls[i].size() - 1) { s << ","; } @@ -326,7 +331,8 @@ String ReportNode::AsJSON() const { s << ","; } } - s << "],"; + s << "],"; // end calls + s << "\"device_metrics\":{"; size_t i = 0; for (const auto& dev_kv : device_metrics) { @@ -334,7 +340,7 @@ String ReportNode::AsJSON() const { s << "\"" << dev_kv.first << "\":{"; for (const auto& metric_kv : dev_kv.second) { s << "\"" << metric_kv.first << "\":"; - print_metric(s, metric_kv.second); + metric_as_json(s, metric_kv.second); if (j < dev_kv.second.size() - 1) { s << ","; } @@ -346,7 +352,20 @@ String ReportNode::AsJSON() const { } i++; } - s << "}}"; + s << "},"; // end device metrics + + s << "\"configuration\":{"; + size_t k = 0; + for (const auto& kv : configuration) { + s << "\"" << kv.first << "\":"; + metric_as_json(s, kv.second); + if (k < configuration.size() - 1) { + s << ","; + } + k++; + } + s << "}"; // end configuration + s << "}"; return s.str(); } @@ -395,6 +414,35 @@ ObjectRef AggregateMetric(const std::vector& metrics) { } } +static String print_metric(ObjectRef metric) { + std::string val; + if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << metric.as()->value; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << std::setprecision(2) << metric.as()->microseconds; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s << std::fixed << std::setprecision(2) << metric.as()->percent; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::setprecision(2) << metric.as()->ratio; + val = s.str(); + } else if (metric.as()) { + val = Downcast(metric); + } else { + LOG(FATAL) << "Cannot print metric of type " << metric->GetTypeKey(); + } + return val; +} + String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) const { // aggregate calls by op hash (or op name if hash is not set) + argument shapes std::vector> aggregated_calls; @@ -533,30 +581,7 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con // fill empty data with empty strings cols[i].push_back(""); } else { - std::string val; - if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << (*it).second.as()->value; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << std::setprecision(2) << (*it).second.as()->microseconds; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s << std::fixed << std::setprecision(2) << (*it).second.as()->percent; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::setprecision(2) << (*it).second.as()->ratio; - val = s.str(); - } else if ((*it).second.as()) { - val = Downcast((*it).second); - } - cols[i].push_back(val); + cols[i].push_back(print_metric((*it).second)); } } } @@ -592,6 +617,12 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con } s << std::endl; } + + // Add configuration information. 
It will not be aligned with the columns. + s << std::endl << "Configuration" << std::endl << "-------------" << std::endl; + for (auto kv : configuration) { + s << kv.first << ": " << print_metric(kv.second) << std::endl; + } return s.str(); } @@ -599,7 +630,7 @@ std::string DeviceString(Device dev) { return DeviceName(dev.device_type) + std::to_string(dev.device_id); } -Report Profiler::Report(bool aggregate, bool sort) { +Report Profiler::Report() { // sync all timers and normalize rows std::vector> rows; for (auto& cf : calls_) { @@ -638,14 +669,16 @@ Report Profiler::Report(bool aggregate, bool sort) { converted_rows.push_back(row); } - return profiling::Report(converted_rows, device_metrics); + return profiling::Report(converted_rows, device_metrics, configuration_); } Report::Report(Array> calls, - Map> device_metrics) { + Map> device_metrics, + Map configuration) { auto node = make_object(); node->calls = std::move(calls); node->device_metrics = std::move(device_metrics); + node->configuration = std::move(configuration); data_ = std::move(node); } @@ -697,6 +730,7 @@ Report Report::FromJSON(String json) { std::string key; Array> calls; Map> device_metrics; + Map configuration; reader.BeginObject(); while (reader.NextObjectItem(&key)) { @@ -713,10 +747,12 @@ Report Report::FromJSON(String json) { device_metrics.Set(device_name, parse_metrics(&reader)); } // reader.EndObject(); + } else if (key == "configuration") { + configuration = parse_metrics(&reader); } } - return Report(calls, device_metrics); + return Report(calls, device_metrics, configuration); } TVM_REGISTER_OBJECT_TYPE(DurationNode); @@ -855,8 +891,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, TVM_REGISTER_GLOBAL("runtime.profiling.Report") .set_body_typed([](Array> calls, - Map> device_metrics) { - return Report(calls, device_metrics); + Map> device_metrics, + Map configuration) { + return Report(calls, device_metrics, configuration); }); TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t count) { diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 393d1b399878f..0ace910b5c539 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -58,9 +58,9 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, // on remotes, we accept a nullptr for collectors. if (collectors.defined()) { std::vector cs(collectors.begin(), collectors.end()); - prof_ = profiling::Profiler(devices, cs); + prof_ = profiling::Profiler(devices, cs, {{String("Executor"), String("VM")}}); } else { - prof_ = profiling::Profiler(devices, {}); + prof_ = profiling::Profiler(devices, {}, {{String("Executor"), String("VM")}}); } auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); @@ -77,7 +77,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, return report; }); } else if (name == "profile_rpc") { - // We cannot return a Report over RPC because TMV RPC mechanism only + // We cannot return a Report over RPC because TVM RPC mechanism only // supports a subset of Object classes. Instead we serialize it on the // remote (here) and deserialize it on the other end. 
return TypedPackedFunc([sptr_to_self, this](std::string arg_name) { diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 29a8414337756..ab22bd2b9c481 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -69,6 +69,7 @@ def test_vm(target, dev): assert "Total" in str(report) assert "AllocTensorReg" in str(report) assert "AllocStorage" in str(report) + assert report.configuration["Executor"] == "VM" csv = read_csv(report) assert "Hash" in csv.keys() @@ -102,6 +103,7 @@ def test_graph_executor(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) assert "Hash" in str(report) + assert "Graph" in str(report) @tvm.testing.parametrize_targets("cuda", "llvm") @@ -147,6 +149,7 @@ def test_json(): parsed = json.loads(report.json()) assert "device_metrics" in parsed assert "calls" in parsed + assert "configuration" in parsed assert "Duration (us)" in parsed["calls"][0] assert "microseconds" in parsed["calls"][0]["Duration (us)"] assert len(parsed["calls"]) > 0 From 68dcecc926f890429a8f2cba9ce55eab6a18fa6e Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 6 Jun 2022 20:02:18 -0700 Subject: [PATCH 054/181] [MetaSchedule] Evo Independence from TaskScheduler (#11590) Per discussion with @Kathryn-cat, we realized that the current API design could be verbose if we only want to tune a single task, in which case a dummy task scheduler still needs to be established to supply `EvolutionarySearch` with proper `CostModel` and `Database`. This PR fixes this UX issue. --- include/tvm/meta_schedule/search_strategy.h | 17 +- include/tvm/meta_schedule/task_scheduler.h | 20 +-- include/tvm/meta_schedule/tune_context.h | 2 - .../search_strategy/search_strategy.py | 24 ++- .../task_scheduler/gradient_based.py | 10 +- .../task_scheduler/round_robin.py | 10 +- .../task_scheduler/task_scheduler.py | 10 +- .../measure_callback/add_to_database.cc | 5 +- .../search_strategy/evolutionary_search.cc | 148 +++++++++--------- .../search_strategy/replay_func.cc | 48 +++--- .../search_strategy/replay_trace.cc | 63 ++++---- .../search_strategy/search_strategy.cc | 7 + .../task_scheduler/gradient_based.cc | 7 +- .../task_scheduler/round_robin.cc | 7 +- .../task_scheduler/task_scheduler.cc | 6 +- .../test_meta_schedule_measure_callback.py | 22 ++- .../test_meta_schedule_search_strategy.py | 93 +++++------ .../test_meta_schedule_task_scheduler.py | 60 +++---- 18 files changed, 298 insertions(+), 261 deletions(-) diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h index 6895673a04cc3..139de7c99d042 100644 --- a/include/tvm/meta_schedule/search_strategy.h +++ b/include/tvm/meta_schedule/search_strategy.h @@ -113,12 +113,16 @@ class SearchStrategyNode : public runtime::Object { /*! * \brief Pre-tuning for the search strategy. - * \param design_spaces The design spaces for pre-tuning. + * \param design_spaces The design spaces used during tuning process. + * \param database The database used during tuning process. + * \param cost_model The cost model used during tuning process. * \note Pre-tuning is supposed to be called before the tuning process and after the * initialization. Because the search strategy is stateful, we can always call pretuning * and reset the search strategy. 
*/ - virtual void PreTuning(const Array& design_spaces) = 0; + virtual void PreTuning(const Array& design_spaces, + const Optional& database, + const Optional& cost_model) = 0; /*! * \brief Post-tuning for the search strategy. @@ -159,7 +163,8 @@ class PySearchStrategyNode : public SearchStrategyNode { * \brief The function type of `PreTuning` method. * \param design_spaces The design spaces for pre-tuning. */ - using FPreTuning = runtime::TypedPackedFunc&)>; + using FPreTuning = runtime::TypedPackedFunc&, const Optional&, const Optional&)>; /*! \brief The function type of `PostTuning` method. */ using FPostTuning = runtime::TypedPackedFunc; /*! @@ -199,10 +204,8 @@ class PySearchStrategyNode : public SearchStrategyNode { this->f_initialize_with_tune_context(context); } - void PreTuning(const Array& design_spaces) final { - ICHECK(f_pre_tuning != nullptr) << "PySearchStrategy's PreTuning method not implemented!"; - this->f_pre_tuning(design_spaces); - } + void PreTuning(const Array& design_spaces, const Optional& database, + const Optional& cost_model) final; void PostTuning() final { ICHECK(f_post_tuning != nullptr) << "PySearchStrategy's PostTuning method not implemented!"; diff --git a/include/tvm/meta_schedule/task_scheduler.h b/include/tvm/meta_schedule/task_scheduler.h index 7453c2b484b90..5953a2c3e42b1 100644 --- a/include/tvm/meta_schedule/task_scheduler.h +++ b/include/tvm/meta_schedule/task_scheduler.h @@ -74,13 +74,13 @@ class TaskSchedulerNode : public runtime::Object { /*! \brief The runner of the scheduler. */ Runner runner{nullptr}; /*! \brief The database of the scheduler. */ - Database database{nullptr}; - /*! \brief The maximum number of trials allowed. */ - int max_trials; + Optional database; /*! \brief The cost model of the scheduler. */ Optional cost_model; /*! \brief The list of measure callbacks of the scheduler. */ Array measure_callbacks; + /*! \brief The maximum number of trials allowed. */ + int max_trials; /*! \brief The number of trials already conducted. */ int num_trials_already; /*! \brief The tuning task's logging function. t*/ @@ -94,9 +94,9 @@ class TaskSchedulerNode : public runtime::Object { v->Visit("builder", &builder); v->Visit("runner", &runner); v->Visit("database", &database); - v->Visit("max_trials", &max_trials); v->Visit("cost_model", &cost_model); v->Visit("measure_callbacks", &measure_callbacks); + v->Visit("max_trials", &max_trials); v->Visit("num_trials_already", &num_trials_already); // `logging_func` is not visited } @@ -243,10 +243,10 @@ class TaskScheduler : public runtime::ObjectRef { TVM_DLL static TaskScheduler RoundRobin(Array tasks, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func); /*! * \brief Create a task scheduler that fetches tasks in a gradient based fashion. 
@@ -268,10 +268,10 @@ class TaskScheduler : public runtime::ObjectRef { Array task_weights, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func, // double alpha, // int window_size, // @@ -297,10 +297,10 @@ class TaskScheduler : public runtime::ObjectRef { Array tasks, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func, // PyTaskSchedulerNode::FTune f_tune, // PyTaskSchedulerNode::FInitializeTask f_initialize_task, // diff --git a/include/tvm/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h index faa24fc99f4ce..d63fb819f3639 100644 --- a/include/tvm/meta_schedule/tune_context.h +++ b/include/tvm/meta_schedule/tune_context.h @@ -61,8 +61,6 @@ class TuneContextNode : public runtime::Object { /*! \brief The number of threads to be used. */ int num_threads; - /*! \brief The task scheduler that owns the tune context */ - const TaskSchedulerNode* task_scheduler; /*! \brief Whether the tuning task has been stopped or finished. */ bool is_terminated; /*! \brief The measure candidates. */ diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py index 07c47f01d1c55..14b46a0785f1d 100644 --- a/python/tvm/meta_schedule/search_strategy/search_strategy.py +++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py @@ -18,7 +18,7 @@ Meta Schedule search strategy that generates the measure candidates for measurement. """ -from typing import Callable, List, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Callable, List, Optional from tvm._ffi import register_object from tvm.runtime import Object @@ -29,6 +29,8 @@ from ..runner import RunnerResult if TYPE_CHECKING: + from ..cost_model import CostModel + from ..database import Database from ..tune_context import TuneContext @@ -87,15 +89,29 @@ def initialize_with_tune_context(self, context: "TuneContext") -> None: self, context ) - def pre_tuning(self, design_spaces: List[Schedule]) -> None: + def pre_tuning( + self, + design_spaces: List[Schedule], + database: Optional["Database"] = None, + cost_model: Optional["CostModel"] = None, + ) -> None: """Pre-tuning for the search strategy. Parameters ---------- design_spaces : List[Schedule] - The design spaces for pre-tuning. + The design spaces used during tuning process. + database : Optional[Database] = None + The database used during tuning process. + cost_model : Optional[CostModel] = None + The cost model used during tuning process. 
""" - _ffi_api.SearchStrategyPreTuning(self, design_spaces) # type: ignore # pylint: disable=no-member + _ffi_api.SearchStrategyPreTuning( # type: ignore # pylint: disable=no-member + self, + design_spaces, + database, + cost_model, + ) def post_tuning(self) -> None: """Post-tuning for the search strategy.""" diff --git a/python/tvm/meta_schedule/task_scheduler/gradient_based.py b/python/tvm/meta_schedule/task_scheduler/gradient_based.py index 6234449bf09b9..20d32dd1c59f9 100644 --- a/python/tvm/meta_schedule/task_scheduler/gradient_based.py +++ b/python/tvm/meta_schedule/task_scheduler/gradient_based.py @@ -45,11 +45,11 @@ def __init__( task_weights: List[float], builder: Builder, runner: Runner, - database: Database, - max_trials: int, *, + database: Database, cost_model: Optional[CostModel] = None, measure_callbacks: Optional[List[MeasureCallback]] = None, + max_trials: int, alpha: float = 0.2, window_size: int = 3, seed: int = -1, @@ -68,12 +68,12 @@ def __init__( The runner. database : Database The database. - max_trials : int - The maximum number of trials to run. cost_model : CostModel, default None. The cost model of the scheduler. measure_callbacks : Optional[List[MeasureCallback]] = None The list of measure callbacks of the scheduler. + max_trials : int + The maximum number of trials to run. alpha : float = 0.2 The parameter alpha in gradient computation. window_size : int = 3 @@ -88,9 +88,9 @@ def __init__( builder, runner, database, - max_trials, cost_model, measure_callbacks, + max_trials, make_logging_func(logger), alpha, window_size, diff --git a/python/tvm/meta_schedule/task_scheduler/round_robin.py b/python/tvm/meta_schedule/task_scheduler/round_robin.py index a461358283949..ed395643bbaae 100644 --- a/python/tvm/meta_schedule/task_scheduler/round_robin.py +++ b/python/tvm/meta_schedule/task_scheduler/round_robin.py @@ -60,11 +60,11 @@ def __init__( task_weights: List[float], builder: Builder, runner: Runner, - database: Database, - max_trials: int, *, + database: Database, cost_model: Optional[CostModel] = None, measure_callbacks: Optional[List[MeasureCallback]] = None, + max_trials: int, ) -> None: """Constructor. @@ -80,12 +80,12 @@ def __init__( The runner. database : Database The database. - max_trials : int - The maximum number of trials. cost_model : Optional[CostModel] The cost model. measure_callbacks: Optional[List[MeasureCallback]] The list of measure callbacks of the scheduler. + max_trials : int + The maximum number of trials. 
""" del task_weights self.__init_handle_by_constructor__( @@ -94,8 +94,8 @@ def __init__( builder, runner, database, - max_trials, cost_model, measure_callbacks, + max_trials, make_logging_func(logger), ) diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py index 4454078a6f16d..3d57a6b01b9db 100644 --- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py +++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py @@ -31,7 +31,6 @@ from ..tune_context import TuneContext from ..utils import make_logging_func - logger = logging.getLogger(__name__) # pylint: disable=invalid-name @@ -177,9 +176,9 @@ class PyTaskScheduler: "builder", "runner", "database", - "max_trials", "cost_model", "measure_callbacks", + "max_trials", ], "methods": [ "tune", @@ -195,18 +194,19 @@ def __init__( tasks: List[TuneContext], builder: Builder, runner: Runner, - database: Database, - max_trials: int, + *, + database: Optional[Database] = None, cost_model: Optional[CostModel] = None, measure_callbacks: Optional[List[MeasureCallback]] = None, + max_trials: int, ): self.tasks = tasks self.builder = builder self.runner = runner self.database = database - self.max_trials = max_trials self.cost_model = cost_model self.measure_callbacks = measure_callbacks + self.max_trials = max_trials def tune(self) -> None: """Auto-tuning.""" diff --git a/src/meta_schedule/measure_callback/add_to_database.cc b/src/meta_schedule/measure_callback/add_to_database.cc index 20581f4630a63..0988da0414e2a 100644 --- a/src/meta_schedule/measure_callback/add_to_database.cc +++ b/src/meta_schedule/measure_callback/add_to_database.cc @@ -27,8 +27,11 @@ class AddToDatabaseNode : public MeasureCallbackNode { const Array& measure_candidates, const Array& builder_results, const Array& runner_results) final { + if (!task_scheduler->database.defined()) { + return; + } TuneContext task = task_scheduler->tasks[task_id]; - Database database = task_scheduler->database; + Database database = task_scheduler->database.value(); Workload workload = database->CommitWorkload(task->mod.value()); Target target = task->target.value(); ICHECK_EQ(runner_results.size(), measure_candidates.size()); diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc index bdef26ef876e5..8b36a95217046 100644 --- a/src/meta_schedule/search_strategy/evolutionary_search.cc +++ b/src/meta_schedule/search_strategy/evolutionary_search.cc @@ -246,13 +246,41 @@ class EvolutionarySearchNode : public SearchStrategyNode { int ed; /*! \brief The counter of returning empty results. */ int num_empty_iters; - - explicit State(EvolutionarySearchNode* self, Array design_spaces) + /*! \brief The metadata of the function arguments. */ + Array args_info_{nullptr}; + /*! \brief Pre thread data including module to be tuned and random state. */ + std::vector per_thread_data_; + /*! + * \brief The workloads that are already measured. + * TODO(junrushao1994): add records from the database to avoid re-measuring. + * */ + IRModuleSet measured_workloads_; + /*! \brief A Database for selecting useful candidates. */ + Database database_{nullptr}; + /*! \brief A cost model helping to explore the search space */ + CostModel cost_model_{nullptr}; + /*! \brief The token registered for the given workload in database. 
*/ + Workload token_{nullptr}; + + explicit State(EvolutionarySearchNode* self, Array design_spaces, Database database, + CostModel cost_model) : self(self), design_spaces(design_spaces), st(0), ed(self->num_trials_per_iter), - num_empty_iters(0) {} + num_empty_iters(0) { + const TuneContextNode* ctx = self->context_; + IRModule mod = ctx->mod.value(); + this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(mod)); + this->per_thread_data_.resize(ctx->num_threads); + for (PerThreadData& data : this->per_thread_data_) { + data.mod = DeepCopyIRModule(mod); + data.rand_state = ForkSeed(&self->rand_state_); + } + this->database_ = database; + this->cost_model_ = cost_model; + this->token_ = database->CommitWorkload(mod); + } /*! * \brief Pick up best candidates from database. @@ -293,33 +321,10 @@ class EvolutionarySearchNode : public SearchStrategyNode { /*! \brief The tuning context of the evolutionary search strategy. */ const TuneContextNode* context_{nullptr}; - /*! \brief The target for the workload. */ - Target target_{nullptr}; - /*! \brief The metadata of the function arguments. */ - Array args_info_{nullptr}; - /*! \brief A Database for selecting useful candidates. */ - Database database_{nullptr}; - /*! \brief A cost model helping to explore the search space */ - CostModel cost_model_{nullptr}; - /*! \brief The postprocessors. */ - Array postprocs_{nullptr}; - /*! \brief Mutators and their probability mass */ - Map mutator_probs_{nullptr}; - /*! \brief The number of threads to use. To be initialized with TuneContext. */ - int num_threads_; /*! \brief The random state. To be initialized with TuneContext. */ TRandState rand_state_; - /*! \brief Pre thread data including module to be tuned and random state. */ - std::vector per_thread_data_; /*! \brief The state of the search strategy. */ std::unique_ptr state_ = nullptr; - /*! \brief The token registered for the given workload in database. */ - Workload token_{nullptr}; - /*! - * \brief The workloads that are already measured. - * TODO(junrushao1994): add records from the database to avoid re-measuring. - * */ - IRModuleSet measured_workloads_; /*** Configuration: global ***/ /*! \brief The number of trials per iteration. 
*/ @@ -351,15 +356,7 @@ class EvolutionarySearchNode : public SearchStrategyNode { void VisitAttrs(tvm::AttrVisitor* v) { // `context_` is not visited - // `target_` is not visited - // `args_info_` is not visited - // `database` is not visited - // `cost_model` is not visited - // `postprocs` is not visited - // `mutator_probs_` is not visited - // `num_threads` is not visited // `rand_state_` is not visited - // `per_thread_data_` is not visited // `state_` is not visited /*** Configuration: global ***/ @@ -386,39 +383,41 @@ class EvolutionarySearchNode : public SearchStrategyNode { CHECK(context->num_threads > 0) << "Number of threads has to be larger than 0."; CHECK(context->target.defined()) << "Target must be defined!"; this->context_ = context.get(); - this->target_ = context->target.value(); - this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(context->mod.value())); - this->mutator_probs_ = context->mutator_probs; - this->postprocs_ = context->postprocs; - this->num_threads_ = context->num_threads; this->rand_state_ = ForkSeed(&context->rand_state); - CHECK(context->task_scheduler != nullptr) - << "ValueError: TaskScheduler is not defined in TuneContext"; - this->cost_model_ = context->task_scheduler->cost_model.value(); - this->database_ = context->task_scheduler->database; - this->token_ = this->database_->CommitWorkload(context->mod.value()); - this->per_thread_data_.resize(this->num_threads_); - for (const auto& kv : this->mutator_probs_) { + for (const auto& kv : context->mutator_probs) { double mass = kv.second->value; TVM_META_SCHEDULE_CHECK_PROB_RANGE(mass, "mutator_probs"); } - for (PerThreadData& data : this->per_thread_data_) { - data.mod = DeepCopyIRModule(context->mod.value()); - data.rand_state = ForkSeed(&this->rand_state_); - } this->state_.reset(); } - void PreTuning(const Array& design_spaces) final { + void PreTuning(const Array& design_spaces, const Optional& database, + const Optional& cost_model) final { ICHECK(!design_spaces.empty()); + CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?"; + CHECK(database.defined()) + << "ValueError: Database is not supplied in PreTuning. Evolutionary" + "search algorithm requires a database to be present, so that it " + "could sample from previously-explored population. If you do not " + "intent to store data on disk, please use `tvm.meta_schedule.testing.DummyDatabase`"; + CHECK(cost_model.defined()) + << "ValueError: CostModel is not supplied in PreTuning. Evolutionary search " + "algorithm expects a cost model to filter out potentially less efficient kernels. 
If " + "you do not expect a cost model to help, please use " + "`tvm.meta_schedule.cost_model.RandomModel`"; + if (this->state_ != nullptr) { + TVM_PY_LOG(WARNING, this->context_->logging_func) + << "EvolutionarySearch is already initialized."; + this->state_.reset(); + } ICHECK(this->state_ == nullptr); - // Change to traces Array design_space_traces; design_space_traces.reserve(design_spaces.size()); for (const Schedule& space : design_spaces) { design_space_traces.push_back(space->trace().value()->Simplified(true)); } - this->state_ = std::make_unique(this, design_space_traces); + this->state_ = + std::make_unique(this, design_space_traces, database.value(), cost_model.value()); } void PostTuning() final { @@ -442,16 +441,16 @@ class EvolutionarySearchNode : public SearchStrategyNode { std::vector EvolutionarySearchNode::State::PickBestFromDatabase(int num) { std::vector measured_traces; measured_traces.reserve(num); - Array top_records = self->database_->GetTopK(self->token_, num); + Array top_records = this->database_->GetTopK(this->token_, num); for (TuningRecord record : top_records) { measured_traces.push_back(record->trace); } int actual_num = measured_traces.size(); - ThreadedTraceApply pp(self->postprocs_); + ThreadedTraceApply pp(self->context_->postprocs); std::vector results(actual_num, Schedule{nullptr}); auto f_proc_measured = [this, &measured_traces, &results, &pp](int thread_id, int trace_id) -> void { - PerThreadData& data = self->per_thread_data_.at(thread_id); + PerThreadData& data = this->per_thread_data_.at(thread_id); TRandState* rand_state = &data.rand_state; const IRModule& mod = data.mod; tir::Trace trace = measured_traces.at(trace_id); @@ -464,17 +463,17 @@ std::vector EvolutionarySearchNode::State::PickBestFromDatabase(int nu throw; } }; - support::parallel_for_dynamic(0, actual_num, self->num_threads_, f_proc_measured); + support::parallel_for_dynamic(0, actual_num, self->context_->num_threads, f_proc_measured); return results; } std::vector EvolutionarySearchNode::State::SampleInitPopulation(int num) { - ThreadedTraceApply pp(self->postprocs_); + ThreadedTraceApply pp(self->context_->postprocs); std::vector out_schs; while (static_cast(out_schs.size()) < self->init_min_unmeasured) { std::vector results(num, Schedule{nullptr}); auto f_proc_unmeasured = [this, &results, &pp](int thread_id, int trace_id) -> void { - PerThreadData& data = self->per_thread_data_.at(thread_id); + PerThreadData& data = this->per_thread_data_.at(thread_id); TRandState* rand_state = &data.rand_state; const IRModule& mod = data.mod; Schedule& result = results.at(trace_id); @@ -485,7 +484,7 @@ std::vector EvolutionarySearchNode::State::SampleInitPopulation(int nu result = sch.value(); } }; - support::parallel_for_dynamic(0, num, self->num_threads_, f_proc_unmeasured); + support::parallel_for_dynamic(0, num, self->context_->num_threads, f_proc_unmeasured); for (int i = 0; i < num; i++) { if (results[i].defined()) { out_schs.push_back(results[i]); @@ -501,14 +500,14 @@ std::vector EvolutionarySearchNode::State::EvolveWithCostModel( std::vector population, int num) { ICHECK_GT(num, 0); // The heap to record best schedule, we do not consider schedules that are already measured - IRModuleSet exists = self->measured_workloads_; + IRModuleSet exists = this->measured_workloads_; SizedHeap heap(num); for (int iter = 0;; ++iter) { // Predict normalized score with the cost model, std::vector scores = PredictNormalizedScore(population, // GetRef(self->context_), // - self->cost_model_, // - 
self->args_info_); + this->cost_model_, // + this->args_info_); ICHECK_EQ(scores.size(), population.size()); for (int i = 0, n = population.size(); i < n; ++i) { Schedule sch = population.at(i); @@ -524,18 +523,18 @@ std::vector EvolutionarySearchNode::State::EvolveWithCostModel( if (iter == self->genetic_num_iters) { break; } - // Set threaded samplers, with probability from predicated normalized throughputs - for (PerThreadData& data : self->per_thread_data_) { - data.Set(scores, self->genetic_mutate_prob, self->mutator_probs_); + // Set threaded samplers, with probability from predicated normalized throughput + for (PerThreadData& data : this->per_thread_data_) { + data.Set(scores, self->genetic_mutate_prob, self->context_->mutator_probs); } - ThreadedTraceApply pp(self->postprocs_); + ThreadedTraceApply pp(self->context_->postprocs); ConcurrentBitmask cbmask(self->population_size); std::vector next_population(self->population_size, Schedule{nullptr}); // The worker function auto f_find_candidate = [&cbmask, &population, &next_population, &pp, this](int thread_id, int trace_id) { // Prepare samplers - PerThreadData& data = self->per_thread_data_.at(thread_id); + PerThreadData& data = this->per_thread_data_.at(thread_id); TRandState* rand_state = &data.rand_state; const IRModule& mod = data.mod; std::function& trace_sampler = data.trace_sampler; @@ -567,7 +566,8 @@ std::vector EvolutionarySearchNode::State::EvolveWithCostModel( result = population.at(sampled_trace_id); } }; - support::parallel_for_dynamic(0, self->population_size, self->num_threads_, f_find_candidate); + support::parallel_for_dynamic(0, self->population_size, self->context_->num_threads, + f_find_candidate); population.swap(next_population); TVM_PY_LOG(INFO, self->context_->logging_func) << "Evolve iter #" << iter << " done. Summary:\n" << pp.SummarizeFailures(); @@ -607,7 +607,7 @@ std::vector EvolutionarySearchNode::State::PickWithEpsGreedy( tir::SampleWithoutReplacement(&self->rand_state_, unmeasured.size(), unmeasured.size()); std::vector results; results.reserve(num); - IRModuleSet& measured_workloads = self->measured_workloads_; + IRModuleSet& measured_workloads = this->measured_workloads_; for (int i = 0, i_bests = 0, i_rands = 0; i < num; ++i) { bool has_best = i_bests < static_cast(bests.size()); bool has_rand = i_rands < static_cast(rands.size()); @@ -677,7 +677,7 @@ Optional> EvolutionarySearchNode::State::GenerateMeasure return NullOpt; } } - return AssembleCandidates(picks, self->args_info_); + return AssembleCandidates(picks, this->args_info_); } void EvolutionarySearchNode::State::NotifyRunnerResults( @@ -713,6 +713,12 @@ SearchStrategy SearchStrategy::EvolutionarySearch(int num_trials_per_iter, / return SearchStrategy(n); } +class EvolutionarySearch : public SearchStrategy { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(EvolutionarySearch, SearchStrategy, + EvolutionarySearchNode); +}; + TVM_REGISTER_NODE_TYPE(EvolutionarySearchNode); TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyEvolutionarySearch") .set_body_typed(SearchStrategy::EvolutionarySearch); diff --git a/src/meta_schedule/search_strategy/replay_func.cc b/src/meta_schedule/search_strategy/replay_func.cc index 878c872a65fe2..1aaaaa09e8ab8 100644 --- a/src/meta_schedule/search_strategy/replay_func.cc +++ b/src/meta_schedule/search_strategy/replay_func.cc @@ -32,8 +32,14 @@ class ReplayFuncNode : public SearchStrategyNode { int st; /*! \brief `[st, ed)` are the indices of the next batch of candidates. */ int ed; + /*! 
\brief The metadata of the function arguments. */ + Array args_info_{nullptr}; - explicit State(ReplayFuncNode* self) : self(self), st(0), ed(self->num_trials_per_iter) {} + explicit State(ReplayFuncNode* self) : self(self), st(0), ed(self->num_trials_per_iter) { + const TuneContextNode* ctx = self->context_; + ICHECK(ctx); + this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(ctx->mod.value())); + } inline Optional> GenerateMeasureCandidates(); inline void NotifyRunnerResults(const Array& results); @@ -44,14 +50,8 @@ class ReplayFuncNode : public SearchStrategyNode { /*! \brief The number of total trials. */ int max_trials_per_task; - /*! \brief The module to be tuned. */ - IRModule mod_{nullptr}; - /*! \brief The metadata of the function arguments. */ - Array args_info_{nullptr}; - /*! \brief The post processors */ - Array postprocs_{nullptr}; - /*! \brief The space generator for measure candidates generation. */ - SpaceGenerator space_generator_{nullptr}; + /*! \brief The tuning context of the search strategy. */ + const TuneContextNode* context_{nullptr}; /*! \brief The random state. -1 means using random number. */ TRandState rand_state_ = -1; /*! \brief The state of the search strategy. */ @@ -60,10 +60,7 @@ class ReplayFuncNode : public SearchStrategyNode { void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("num_trials_per_iter", &num_trials_per_iter); v->Visit("max_trials_per_task", &max_trials_per_task); - // `space_generator_` is not visited - // `mod_` is not visited - // `args_info_` is not visited - // `num_threads_` is not visited + // `context_` is not visited. // `rand_state_` is not visited // `state_` is not visited } @@ -72,15 +69,21 @@ class ReplayFuncNode : public SearchStrategyNode { TVM_DECLARE_FINAL_OBJECT_INFO(ReplayFuncNode, SearchStrategyNode); void InitializeWithTuneContext(const TuneContext& context) final { - this->space_generator_ = context->space_generator.value(); - this->mod_ = context->mod.value(); - this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(context->mod.value())); - this->postprocs_ = context->postprocs; + CHECK(context->space_generator.defined()) + << "ValueError: TuneContext.space_generator is not defined"; + CHECK(context->mod.defined()) << "ValueError: TuneContext.mod is not defined"; + this->context_ = context.get(); this->rand_state_ = ForkSeed(&context->rand_state); this->state_.reset(); } - void PreTuning(const Array& design_spaces) final { + void PreTuning(const Array& design_spaces, const Optional& database, + const Optional& cost_model) final { + CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?"; + if (this->state_ != nullptr) { + TVM_PY_LOG(WARNING, this->context_->logging_func) << "ReplayFunc is already initialized."; + this->state_.reset(); + } ICHECK(this->state_ == nullptr); this->state_ = std::make_unique(this); } @@ -109,21 +112,24 @@ inline Optional> ReplayFuncNode::State::GenerateMeasureC } ed = std::min(ed, self->max_trials_per_task); Array result; + const TuneContextNode* ctx = self->context_; + ICHECK(ctx); + IRModule mod = ctx->mod.value(); for (int i = st; i < ed; i++) { for (;;) { - Array schs = self->space_generator_->GenerateDesignSpace(self->mod_); + Array schs = ctx->space_generator.value()->GenerateDesignSpace(mod); int design_space_index = tir::SampleInt(&self->rand_state_, 0, schs.size()); tir::Schedule sch = schs[design_space_index]; sch->EnterPostproc(); bool failed = false; - for (const Postproc& proc : self->postprocs_) { + for (const Postproc& proc : 
ctx->postprocs) { if (!proc->Apply(sch)) { failed = true; break; } } if (!failed) { - result.push_back(MeasureCandidate(sch, self->args_info_)); + result.push_back(MeasureCandidate(sch, this->args_info_)); break; } } diff --git a/src/meta_schedule/search_strategy/replay_trace.cc b/src/meta_schedule/search_strategy/replay_trace.cc index f17c5d6c4eb3e..13f32a744e3a0 100644 --- a/src/meta_schedule/search_strategy/replay_trace.cc +++ b/src/meta_schedule/search_strategy/replay_trace.cc @@ -35,8 +35,22 @@ class ReplayTraceNode : public SearchStrategyNode { /*! \brief `[st, ed)` are the indices of the next batch of candidates. */ int ed; + /*! \brief The module to be tuned. */ + Array per_thread_mod_{nullptr}; + /*! \brief The metadata of the function arguments. */ + Array args_info_{nullptr}; + explicit State(ReplayTraceNode* self, Array design_spaces) - : self(self), design_spaces(design_spaces), st(0), ed(self->num_trials_per_iter) {} + : self(self), design_spaces(design_spaces), st(0), ed(self->num_trials_per_iter) { + const TuneContextNode* ctx = self->context_; + ICHECK(ctx); + IRModule mod = ctx->mod.value(); + this->per_thread_mod_.reserve(ctx->num_threads); + for (int i = 0; i < ctx->num_threads; i++) { + this->per_thread_mod_.push_back(DeepCopyIRModule(mod)); + } + this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(mod)); + } inline Optional> GenerateMeasureCandidates(); inline void NotifyRunnerResults(const Array& results); @@ -47,14 +61,8 @@ class ReplayTraceNode : public SearchStrategyNode { /*! \brief The number of total trials. */ int max_trials_per_task; - /*! \brief The module to be tuned. */ - Array per_thread_mod_{nullptr}; - /*! \brief The metadata of the function arguments. */ - Array args_info_{nullptr}; - /*! \brief The post processors */ - Array postprocs_{nullptr}; - /*! \brief The number of threads to use. -1 means using logical cpu number. */ - int num_threads_ = -1; + /*! \brief The tuning context of the search strategy. */ + const TuneContextNode* context_{nullptr}; /*! \brief The random state. -1 means using random number. */ TRandState rand_state_ = -1; /*! \brief The state of the search strategy. */ @@ -63,10 +71,7 @@ class ReplayTraceNode : public SearchStrategyNode { void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("num_trials_per_iter", &num_trials_per_iter); v->Visit("max_trials_per_task", &max_trials_per_task); - // `per_thread_mod_` is not visited - // `args_info_` is not visited - // `postprocs_` is not visited - // `num_threads_` is not visited + // `context_` is not visited. 
// `rand_state_` is not visited // `state_` is not visited } @@ -75,22 +80,20 @@ class ReplayTraceNode : public SearchStrategyNode { TVM_DECLARE_FINAL_OBJECT_INFO(ReplayTraceNode, SearchStrategyNode); void InitializeWithTuneContext(const TuneContext& context) final { - CHECK(context->num_threads > 0) << "Number of threads has to be larger than 0."; - this->num_threads_ = context->num_threads; - - this->per_thread_mod_.reserve(this->num_threads_); - for (int i = 0; i < this->num_threads_; i++) { - this->per_thread_mod_.push_back(DeepCopyIRModule(context->mod.value())); - } - - this->args_info_ = ArgInfo::FromPrimFunc(FindEntryFunc(context->mod.value())); - this->postprocs_ = context->postprocs; + CHECK(context->mod.defined()) << "ValueError: TuneContext.mod is not defined"; + this->context_ = context.get(); this->rand_state_ = ForkSeed(&context->rand_state); this->state_.reset(); } - void PreTuning(const Array& design_spaces) final { + void PreTuning(const Array& design_spaces, const Optional& database, + const Optional& cost_model) final { ICHECK(!design_spaces.empty()); + CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?"; + if (this->state_ != nullptr) { + TVM_PY_LOG(WARNING, this->context_->logging_func) << "RelayTrace is already initialized."; + this->state_.reset(); + } ICHECK(this->state_ == nullptr); Array design_space_traces; design_space_traces.reserve(design_spaces.size()); @@ -124,24 +127,26 @@ inline Optional> ReplayTraceNode::State::GenerateMeasure } ed = std::min(ed, self->max_trials_per_task); ICHECK_LT(st, ed); - std::vector per_thread_rand_state = ForkSeed(&self->rand_state_, self->num_threads_); + const TuneContextNode* ctx = self->context_; + ICHECK(ctx); + std::vector per_thread_rand_state = ForkSeed(&self->rand_state_, ctx->num_threads); Array per_task_result(ed - st, MeasureCandidate{nullptr}); - ThreadedTraceApply pp(self->postprocs_); + ThreadedTraceApply pp(ctx->postprocs); auto f_worker = [this, &per_thread_rand_state, &per_task_result, &pp](int thread_id, int task_id) -> void { TRandState& rand_state = per_thread_rand_state[thread_id]; - IRModule mod = self->per_thread_mod_[thread_id]; + IRModule mod = this->per_thread_mod_[thread_id]; for (;;) { int design_space_index = tir::SampleInt(&rand_state, 0, design_spaces.size()); tir::Trace trace = design_spaces[design_space_index]; tir::Trace new_trace = tir::Trace(trace->insts, {}); if (Optional sch = pp.Apply(mod, new_trace, &rand_state)) { - per_task_result.Set(task_id, MeasureCandidate(sch.value(), self->args_info_)); + per_task_result.Set(task_id, MeasureCandidate(sch.value(), this->args_info_)); break; } } }; - support::parallel_for_dynamic(0, ed - st, self->num_threads_, f_worker); + support::parallel_for_dynamic(0, ed - st, ctx->num_threads, f_worker); return per_task_result; } diff --git a/src/meta_schedule/search_strategy/search_strategy.cc b/src/meta_schedule/search_strategy/search_strategy.cc index fefe8dfce76e9..a6a1100cebe60 100644 --- a/src/meta_schedule/search_strategy/search_strategy.cc +++ b/src/meta_schedule/search_strategy/search_strategy.cc @@ -28,6 +28,13 @@ MeasureCandidate::MeasureCandidate(tir::Schedule sch, Array args_info) data_ = std::move(n); } +void PySearchStrategyNode::PreTuning(const Array& design_spaces, + const Optional& database, + const Optional& cost_model) { + ICHECK(f_pre_tuning != nullptr) << "PySearchStrategy's PreTuning method not implemented!"; + this->f_pre_tuning(design_spaces, database, cost_model); +} + SearchStrategy 
SearchStrategy::PySearchStrategy( PySearchStrategyNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PySearchStrategyNode::FPreTuning f_pre_tuning, // diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc index f8cc9d5514941..73d191f593fec 100644 --- a/src/meta_schedule/task_scheduler/gradient_based.cc +++ b/src/meta_schedule/task_scheduler/gradient_based.cc @@ -189,10 +189,10 @@ TaskScheduler TaskScheduler::GradientBased(Array tasks, Array task_weights, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func, // double alpha, // int window_size, // @@ -227,9 +227,6 @@ TaskScheduler TaskScheduler::GradientBased(Array tasks, n->best_time_cost_per_task_ = std::vector(n_tasks, 1e100); n->num_rounds_already_ = 0; support::LinearCongruentialEngine(&n->rand_state_).Seed(seed); - for (const TuneContext& task : tasks) { - task->task_scheduler = n.get(); - } return TaskScheduler(n); } diff --git a/src/meta_schedule/task_scheduler/round_robin.cc b/src/meta_schedule/task_scheduler/round_robin.cc index 446b11837930b..ea22878840aff 100644 --- a/src/meta_schedule/task_scheduler/round_robin.cc +++ b/src/meta_schedule/task_scheduler/round_robin.cc @@ -58,10 +58,10 @@ class RoundRobinNode final : public TaskSchedulerNode { TaskScheduler TaskScheduler::RoundRobin(Array tasks, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func) { ObjectPtr n = make_object(); n->tasks = tasks; @@ -74,9 +74,6 @@ TaskScheduler TaskScheduler::RoundRobin(Array tasks, n->logging_func = logging_func; n->num_trials_already = 0; n->task_id = -1; - for (const TuneContext& task : tasks) { - task->task_scheduler = n.get(); - } return TaskScheduler(n); } diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc index fd1d95cd1f19b..25867fb4f3bbf 100644 --- a/src/meta_schedule/task_scheduler/task_scheduler.cc +++ b/src/meta_schedule/task_scheduler/task_scheduler.cc @@ -117,7 +117,7 @@ void TaskSchedulerNode::InitializeTask(int task_id) { << tir::AsTVMScript(sch->mod()) << "\n" << Concat(trace->AsPython(false), "\n"); } - task->search_strategy.value()->PreTuning(design_spaces); + task->search_strategy.value()->PreTuning(design_spaces, database, cost_model); } void TaskSchedulerNode::Tune() { @@ -203,10 +203,10 @@ TaskScheduler TaskScheduler::PyTaskScheduler( Array tasks, // Builder builder, // Runner runner, // - Database database, // - int max_trials, // + Optional database, // Optional cost_model, // Optional> measure_callbacks, // + int max_trials, // PackedFunc logging_func, // PyTaskSchedulerNode::FTune f_tune, // PyTaskSchedulerNode::FInitializeTask f_initialize_task, // diff --git a/tests/python/unittest/test_meta_schedule_measure_callback.py b/tests/python/unittest/test_meta_schedule_measure_callback.py index a1b188930f86a..298b51e0158e5 100644 --- a/tests/python/unittest/test_meta_schedule_measure_callback.py +++ b/tests/python/unittest/test_meta_schedule_measure_callback.py @@ -16,12 +16,10 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import re -from random import random from typing import List import pytest import tvm -from tvm.ir import IRModule, assert_structural_equal from tvm.meta_schedule.builder import BuilderResult from tvm.meta_schedule.measure_callback import PyMeasureCallback from tvm.meta_schedule.runner import RunnerResult @@ -66,7 +64,7 @@ def apply( results: List[RunnerResult], ) -> None: assert len(measure_candidates) == 1 - assert_structural_equal(measure_candidates[0].sch.mod, Matmul) + tvm.ir.assert_structural_equal(measure_candidates[0].sch.mod, Matmul) assert ( len(builds) == 1 and builds[0].error_msg is None @@ -78,7 +76,14 @@ def apply( measure_callback = FancyMeasureCallback() measure_callback.apply( - RoundRobin([], [], DummyBuilder(), DummyRunner(), DummyDatabase(), max_trials=1), + RoundRobin( + tasks=[], + task_weights=[], + builder=DummyBuilder(), + runner=DummyRunner(), + database=DummyDatabase(), + max_trials=1, + ), 0, [MeasureCandidate(Schedule(Matmul), None)], [BuilderResult("test_build", None)], @@ -102,7 +107,14 @@ def apply( measure_callback = FailingMeasureCallback() with pytest.raises(ValueError, match="test"): measure_callback.apply( - RoundRobin([], [], DummyBuilder(), DummyRunner(), DummyDatabase(), max_trials=1), + RoundRobin( + tasks=[], + task_weights=[], + builder=DummyBuilder(), + runner=DummyRunner(), + database=DummyDatabase(), + max_trials=1, + ), 0, [MeasureCandidate(Schedule(Matmul), None)], [BuilderResult("test_build", None)], diff --git a/tests/python/unittest/test_meta_schedule_search_strategy.py b/tests/python/unittest/test_meta_schedule_search_strategy.py index 94042dd753e0d..4eb8aac5a3314 100644 --- a/tests/python/unittest/test_meta_schedule_search_strategy.py +++ b/tests/python/unittest/test_meta_schedule_search_strategy.py @@ -123,43 +123,37 @@ def _schedule_matmul_small(sch: Schedule): num_trials_per_iter = 10 max_trials_per_task = 2000 + (correct_sch,) = ScheduleFn(sch_fn=_schedule_matmul).generate_design_space(Matmul) - strategy = EvolutionarySearch( - num_trials_per_iter=num_trials_per_iter, - max_trials_per_task=max_trials_per_task, - population_size=5, - init_measured_ratio=0.1, - init_min_unmeasured=50, - genetic_num_iters=3, - genetic_mutate_prob=0.5, - genetic_max_fail_count=10, - eps_greedy=0.9, - ) context = TuneContext( mod=Matmul, - space_generator=ScheduleFn(sch_fn=_schedule_matmul_small), + space_generator=ScheduleFn( + sch_fn=_schedule_matmul_small, + ), + search_strategy=EvolutionarySearch( + num_trials_per_iter=num_trials_per_iter, + max_trials_per_task=max_trials_per_task, + population_size=5, + init_measured_ratio=0.1, + init_min_unmeasured=50, + genetic_num_iters=3, + genetic_mutate_prob=0.5, + genetic_max_fail_count=10, + eps_greedy=0.9, + ), mutator_probs={ DummyMutator(): 1.0, }, target=tvm.target.Target("llvm"), num_threads=1, # because we are using a mutator from the python side ) - _scheduler = RoundRobin( - tasks=[context], - task_weights=[1.0], - builder=ms.builder.LocalBuilder(), - runner=ms.runner.LocalRunner(), + context.initialize() + strategy = context.search_strategy + strategy.pre_tuning( + context.space_generator.generate_design_space(context.mod), database=DummyDatabase(), cost_model=ms.cost_model.RandomModel(), - measure_callbacks=[], - max_trials=1, ) - context.space_generator.initialize_with_tune_context(context) - spaces = context.space_generator.generate_design_space(context.mod) - - 
strategy.initialize_with_tune_context(context) - strategy.pre_tuning(spaces) - (correct_sch,) = ScheduleFn(sch_fn=_schedule_matmul).generate_design_space(Matmul) num_trials_each_iter: List[int] = [] candidates = strategy.generate_measure_candidates() while candidates is not None: @@ -177,52 +171,46 @@ def _schedule_matmul_small(sch: Schedule): strategy.post_tuning() assert sum(num_trials_each_iter) == 25 assert num_trials_each_iter.count(0) < 5 - del _scheduler def test_meta_schedule_evolutionary_search_early_stop(): # pylint: disable = invalid-name] def _schedule_matmul_empty(sch: Schedule): return sch + (correct_sch,) = ScheduleFn(sch_fn=_schedule_matmul).generate_design_space(Matmul) + num_trials_per_iter = 10 max_trials_per_task = 100 - strategy = EvolutionarySearch( - num_trials_per_iter=num_trials_per_iter, - max_trials_per_task=max_trials_per_task, - population_size=5, - init_measured_ratio=0.1, - init_min_unmeasured=50, - genetic_num_iters=3, - genetic_mutate_prob=0.5, - genetic_max_fail_count=10, - eps_greedy=0.9, - ) context = TuneContext( mod=Matmul, - space_generator=ScheduleFn(sch_fn=_schedule_matmul_empty), + search_strategy=EvolutionarySearch( + num_trials_per_iter=num_trials_per_iter, + max_trials_per_task=max_trials_per_task, + population_size=5, + init_measured_ratio=0.1, + init_min_unmeasured=50, + genetic_num_iters=3, + genetic_mutate_prob=0.5, + genetic_max_fail_count=10, + eps_greedy=0.9, + ), + space_generator=ScheduleFn( + sch_fn=_schedule_matmul_empty, + ), mutator_probs={ DummyMutator(): 1.0, }, target=tvm.target.Target("llvm"), - num_threads=1, # because we are using a mutator from the python side + num_threads=1, ) - _scheduler = RoundRobin( - tasks=[context], - task_weights=[1.0], - builder=ms.builder.LocalBuilder(), - runner=ms.runner.LocalRunner(), + context.initialize() + strategy = context.search_strategy + strategy.pre_tuning( + context.space_generator.generate_design_space(context.mod), database=DummyDatabase(), cost_model=ms.cost_model.RandomModel(), - measure_callbacks=[], - max_trials=1, ) - context.space_generator.initialize_with_tune_context(context) - spaces = context.space_generator.generate_design_space(context.mod) - - strategy.initialize_with_tune_context(context) - strategy.pre_tuning(spaces) - (correct_sch,) = ScheduleFn(sch_fn=_schedule_matmul).generate_design_space(Matmul) num_trials_each_iter: List[int] = [] candidates = strategy.generate_measure_candidates() while candidates is not None: @@ -239,7 +227,6 @@ def _schedule_matmul_empty(sch: Schedule): candidates = strategy.generate_measure_candidates() strategy.post_tuning() assert num_trials_each_iter == [1, 0, 0, 0, 0] - del _scheduler if __name__ == "__main__": diff --git a/tests/python/unittest/test_meta_schedule_task_scheduler.py b/tests/python/unittest/test_meta_schedule_task_scheduler.py index 025bbe4225b54..f24dc5fbbc1fd 100644 --- a/tests/python/unittest/test_meta_schedule_task_scheduler.py +++ b/tests/python/unittest/test_meta_schedule_task_scheduler.py @@ -17,7 +17,6 @@ """ Test Meta Schedule Task Scheduler """ import random -import sys import weakref from typing import Set @@ -108,7 +107,6 @@ def main( # type: ignore def _schedule_matmul(sch: Schedule): block = sch.get_block("matmul") i, j, k = sch.get_loops(block=block) - # TODO(@zxybazh): Change to `sample_perfect_tile` after upstreaming i_0, i_1, i_2, i_3 = sch.split(loop=i, factors=[2, 4, 64, 2]) j_0, j_1, j_2, j_3 = sch.split(loop=j, factors=[4, 64, 2, 2]) k_0, k_1 = sch.split(loop=k, factors=[32, 32]) @@ -118,7 +116,6 @@ 
def _schedule_matmul(sch: Schedule): def _schedule_batch_matmul(sch: Schedule): block = sch.get_block("matmul") i, j, k, t = sch.get_loops(block=block) - # TODO(@zxybazh): Change to `sample_perfect_tile` after upstreaming i_0, i_1, i_2, i_3 = sch.split(loop=i, factors=[2, 2, 2, 2]) j_0, j_1, j_2, j_3 = sch.split(loop=j, factors=[2, 4, 64, 2]) k_0, k_1 = sch.split(loop=k, factors=[32, 32]) @@ -156,23 +153,22 @@ def next_task_id(self) -> int: def test_meta_schedule_task_scheduler_single(): num_trials_per_iter = 3 max_trials_per_task = 10 - sch_fn = ScheduleFn(sch_fn=_schedule_matmul) - replay = ReplayTrace(num_trials_per_iter, max_trials_per_task) - task = TuneContext( - MatmulModule, - target=tvm.target.Target("llvm"), - space_generator=sch_fn, - search_strategy=replay, - task_name="Test", - rand_state=42, - ) database = DummyDatabase() round_robin = RoundRobin( - [task], + [ + TuneContext( + MatmulModule, + target=tvm.target.Target("llvm"), + space_generator=ScheduleFn(sch_fn=_schedule_matmul), + search_strategy=ReplayTrace(num_trials_per_iter, max_trials_per_task), + task_name="Test", + rand_state=42, + ) + ], [1.0], - DummyBuilder(), - DummyRunner(), - database, + builder=DummyBuilder(), + runner=DummyRunner(), + database=database, measure_callbacks=[measure_callback.AddToDatabase()], max_trials=max_trials_per_task, ) @@ -212,10 +208,10 @@ def test_meta_schedule_task_scheduler_multiple(): database = DummyDatabase() round_robin = RoundRobin( tasks, - [1.0], - DummyBuilder(), - DummyRunner(), - database, + [1.0, 1.0, 1.0], + builder=DummyBuilder(), + runner=DummyRunner(), + database=database, measure_callbacks=[measure_callback.AddToDatabase()], max_trials=max_trials_per_task * len(tasks), ) @@ -239,18 +235,23 @@ class NIETaskScheduler(PyTaskScheduler): pass with pytest.raises(TVMError, match="PyTaskScheduler's NextTaskId method not implemented!"): - scheduler = NIETaskScheduler([], DummyBuilder(), DummyRunner(), DummyDatabase(), 1) + scheduler = NIETaskScheduler( + tasks=[], + builder=DummyBuilder(), + runner=DummyRunner(), + database=DummyDatabase(), + max_trials=1, + ) scheduler.next_task_id() def test_meta_schedule_task_scheduler_avoid_cyclic(): # pylint: disable=invalid-name - database = DummyDatabase() scheduler = MyTaskScheduler( [], - DummyBuilder(), - DummyRunner(), - database, + builder=DummyBuilder(), + runner=DummyRunner(), + database=database, measure_callbacks=[ measure_callback.AddToDatabase(), ], @@ -262,7 +263,6 @@ def test_meta_schedule_task_scheduler_avoid_cyclic(): # pylint: disable=invalid def test_meta_schedule_task_scheduler_override_next_task_id_only(): # pylint: disable=invalid-name - num_trials_per_iter = 6 max_trials_per_task = 101 tasks = [ @@ -294,9 +294,9 @@ def test_meta_schedule_task_scheduler_override_next_task_id_only(): # pylint: d database = DummyDatabase() scheduler = MyTaskScheduler( tasks, - DummyBuilder(), - DummyRunner(), - database, + builder=DummyBuilder(), + runner=DummyRunner(), + database=database, measure_callbacks=[ measure_callback.AddToDatabase(), ], From a2ef144ea3aa8ae763c59cc596e73d6a89b3f046 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 7 Jun 2022 00:57:59 -0700 Subject: [PATCH 055/181] Refactor RewriteTensorize to prevent concurrent map updates (#11596) --- .../postproc/rewrite_tensorize.cc | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/meta_schedule/postproc/rewrite_tensorize.cc b/src/meta_schedule/postproc/rewrite_tensorize.cc index 1ad394e49c596..3df9075972963 100644 --- 
a/src/meta_schedule/postproc/rewrite_tensorize.cc +++ b/src/meta_schedule/postproc/rewrite_tensorize.cc @@ -28,10 +28,10 @@ namespace meta_schedule { using tir::BlockRV; using tir::LoopRV; -void ApplyTensorization(const tir::Schedule& sch, const String& func_name, - const tir::PrimFuncNode* func, bool vectorize_init_loop) { - std::vector>> jobs; - +void CollectTensorizationJobs( + const tir::Schedule& sch, const String& func_name, const tir::PrimFuncNode* func, + bool vectorize_init_loop, + std::vector>>* jobs) { tir::PostOrderVisit(func->body, [=, &jobs](const ObjectRef& obj) { if (const auto* block = obj.as()) { tir::StmtSRef block_sref = sch->GetSRef(block); @@ -39,7 +39,7 @@ void ApplyTensorization(const tir::Schedule& sch, const String& func_name, tir::GetAnn(block_sref, tir::attr::meta_schedule_auto_tensorize)) { std::string block_name = block_sref->StmtAs()->name_hint; if (block_name.find("init") == std::string::npos) { - jobs.emplace_back(block_name, [sch, intrin_name](tir::BlockRV block) { + jobs->emplace_back(block_name, func_name, [sch, intrin_name](tir::BlockRV block) { try { sch->Tensorize(block, intrin_name.value()); } catch (const std::exception& e) { @@ -47,7 +47,7 @@ void ApplyTensorization(const tir::Schedule& sch, const String& func_name, } }); } else if (vectorize_init_loop) { - jobs.emplace_back(block_name, [sch](tir::BlockRV block) { + jobs->emplace_back(block_name, func_name, [sch](tir::BlockRV block) { Array child_blocks = sch->GetChildBlocks(block); ICHECK(child_blocks.size() == 1); Array init_loops = sch->GetLoops(child_blocks[0]); @@ -58,12 +58,6 @@ void ApplyTensorization(const tir::Schedule& sch, const String& func_name, } } }); - - for (auto kv : jobs) { - tir::BlockRV block = sch->GetBlock(kv.first, func_name); - sch->Unannotate(block, tir::attr::meta_schedule_auto_tensorize); - kv.second(block); - } } class RewriteTensorizeNode : public PostprocNode { @@ -81,13 +75,23 @@ class RewriteTensorizeNode : public PostprocNode { }; bool RewriteTensorizeNode::Apply(const tir::Schedule& sch) { + // The rewriting jobs, 3-tuple (block_name, func_name, job_func) + std::vector>> jobs; for (const auto& kv : sch->mod()->functions) { GlobalVar g_var = kv.first; BaseFunc base_func = kv.second; if (const tir::PrimFuncNode* prim_func = base_func.as()) { - ApplyTensorization(sch, g_var->name_hint, prim_func, vectorize_init_loop); + CollectTensorizationJobs(sch, g_var->name_hint, prim_func, vectorize_init_loop, &jobs); } } + for (const auto& job : jobs) { + const String& block_name = std::get<0>(job); + const String& func_name = std::get<1>(job); + const auto& job_func = std::get<2>(job); + BlockRV block = sch->GetBlock(block_name, func_name); + sch->Unannotate(block, tir::attr::meta_schedule_auto_tensorize); + job_func(block); + } return true; } From 70884e957aa5c8de9c02c25a14d30563d7300cb9 Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 7 Jun 2022 00:58:14 -0700 Subject: [PATCH 056/181] fix uint case (#11597) --- src/relay/transforms/fold_explicit_padding.cc | 3 ++- tests/python/relay/test_pass_fold_explicit_padding.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc index c60f36c7540e2..00162abc69f90 100644 --- a/src/relay/transforms/fold_explicit_padding.cc +++ b/src/relay/transforms/fold_explicit_padding.cc @@ -269,7 +269,8 @@ class SimplifyExplicitPad { } else if (node_map.count(avg_pool3d_)) { attrs = MakeAvgPoolAttrs(param, call_node->attrs.as()); } - } 
else if (node_map.count(max_pool_)) { + } + if (node_map.count(max_pool_)) { // Fold Padding and MaxPool only if pad_value is the min possible value for the dtype auto min_value = tvm::min_value(tvm::runtime::DataType(pad_value->data->dtype)); const FloatImmNode* maybe_min_float = min_value.as(); diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py index 41e2500d4ffa9..35354508a953a 100644 --- a/tests/python/relay/test_pass_fold_explicit_padding.py +++ b/tests/python/relay/test_pass_fold_explicit_padding.py @@ -228,8 +228,8 @@ def validate( # Check Pool pad folding when pad width on pad op is all zero. validate(max_pools, 1, [[0, 0], [0, 0], [0, 0]], float_min_val, [2, 0], "NCW", 2) - # Check MaxPool pad folding with int dtype - int_min_val = get_min_value("int32") + # Check MaxPool pad folding with uint dtype + int_min_val = get_min_value("uint8") validate( max_pools, 2, @@ -238,7 +238,7 @@ def validate( [2, 0, 0, 0], "NCHW", 2, - dtype="int32", + dtype="uint8", ) # Fold when original AvgPool has its own padding but count_include_pad=True validate( From 32a86f8304928f16286cd9ffe6d47abc6c4a5bb6 Mon Sep 17 00:00:00 2001 From: Altan Haan <3124994+altanh@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:33:21 -0700 Subject: [PATCH 057/181] [TOPI] TE implementation of LSTM using scan (#11531) * TE implementation of LSTM in TOPI * docstring * lint * add injective tags where applicable --- python/tvm/topi/generic/nn.py | 16 ++ python/tvm/topi/nn/__init__.py | 1 + python/tvm/topi/nn/lstm.py | 235 +++++++++++++++++++++ python/tvm/topi/testing/__init__.py | 1 + python/tvm/topi/testing/lstm_python.py | 134 ++++++++++++ tests/python/topi/python/test_topi_lstm.py | 161 ++++++++++++++ 6 files changed, 548 insertions(+) create mode 100644 python/tvm/topi/nn/lstm.py create mode 100644 python/tvm/topi/testing/lstm_python.py create mode 100644 tests/python/topi/python/test_topi_lstm.py diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 4226c6caf23c9..80ea00ab01530 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -881,3 +881,19 @@ def schedule_correlation_nchw(outs): The computation schedule for the op. """ return _default_schedule(outs, False) + + +def schedule_lstm(outs): + """Schedule for LSTM + + Parameters + ---------- + outs : Array of Tensor + The outputs of LSTM (hidden states and cell states). + + Returns + ------- + sch: Schedule + The default schedule for LSTM. + """ + return _default_schedule(outs, False) diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index d3d00305a17b3..1dd922d76819c 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -51,3 +51,4 @@ from .space_to_batch_nd import * from .batch_to_space_nd import * from .loss import * +from .lstm import * diff --git a/python/tvm/topi/nn/lstm.py b/python/tvm/topi/nn/lstm.py new file mode 100644 index 0000000000000..b9723b5675d01 --- /dev/null +++ b/python/tvm/topi/nn/lstm.py @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""General LSTM implementation using TE scan.""" +from tvm import te, tir +from tvm.topi import tag + + +def lstm( + Xs, + Wi, + Wh, + Bi=None, + Bh=None, + h_init=None, + c_init=None, + proj=None, + p_i=None, + p_f=None, + p_o=None, + f_act=tir.sigmoid, + g_act=tir.tanh, + h_act=tir.tanh, + reverse=False, + weight_layout: str = "IFGO", +): + """General LSTM implemented using TE scan. + + Parameters + ---------- + Xs : te.Tensor + Input sequence with shape `(seq_len, batch_size, in_dim)` + Wi : te.Tensor + Input weight matrix with shape `(4 * hidden_dim, in_dim)`. The weights are packed according + to `weight_layout`. + Wh : te.Tensor + Hidden weight matrix with shape `(4 * hidden_dim, hidden_dim or proj_dim)`. Packed as `Wh`. + Bi : te.Tensor, optional + Input bias with shape `(4 * hidden_dim,)`, by default None. Packed as `Wh`. + Bh : te.Tensor, optional + Hidden bias with shape as `Bi`, by default None. Packed as `Wh`. + h_init : te.Tensor, optional + Initial hidden state with shape `(batch_size, hidden_dim or proj_dim)`, zero if None + c_init : te.Tensor, optional + Initial cell state with same shape as `h_init`, zero if None + proj : te.Tensor, optional + Projection matrix with shape `(proj_dim, hidden_dim)`, by default None + p_i, p_f, p_o : te.Tensor, optional + Peephole LSTM matrices with shape `(batch_size, hidden_dim)`, by default None + f_act, g_act, h_act : F, optional + Gate activation functions + reverse : bool, optional + Whether to process `Xs` in reverse, by default False + weight_layout : str, optional + The packed weight layout for gates, by default "IFGO". Note: I = input, F = forget, + G = cell, O = output. + + Returns + ------- + result : te.Tensor, te.Tensor + Tuple of hidden states (with shape `(seq_len, batch_size, hidden_dim or proj_dim)`), and + cell states (with shape `(seq_len, batch_size, hidden_dim)`). 
+ """ + assert len(weight_layout) == 4 and sorted(weight_layout) == sorted( + "IFGO" + ), f'given weight layout "{weight_layout}" is not a permutation of "IFGO"' + + i_gate_idx = weight_layout.find("I") + f_gate_idx = weight_layout.find("F") + g_gate_idx = weight_layout.find("G") + o_gate_idx = weight_layout.find("O") + + seq_len, batch_size, in_dim = Xs.shape + assert ( + Wi.shape[0] % 4 == 0 + ), f"dim 0 of input weight should be 4 * hidden_dim, but {Wi.shape[0]} is not divisible by 4" + hidden_dim = Wi.shape[0] // 4 + proj_dim = hidden_dim + if proj is not None: + proj_dim = proj.shape[0] + + # te.scan uses up 1 element for the initial value + scan_len = seq_len + 1 + + # precompute input-hidden matmul outside the scan + ki = te.reduce_axis((0, in_dim), name="ki2h") + Xi2h = te.compute( + (seq_len * batch_size, 4 * hidden_dim), + lambda tb, ij: te.sum(Xs[(tb // batch_size), tb % batch_size, ki] * Wi[ij, ki], axis=ki), + name="Xi2h", + ) + if Bi is not None: + Xi2h = te.compute( + Xi2h.shape, lambda tb, ij: Xi2h[tb, ij] + Bi[ij], name="Xi2h_bias", tag=tag.INJECTIVE + ) + + h_state = te.placeholder((scan_len, batch_size, proj_dim), name="h_state") + c_state = te.placeholder((scan_len, batch_size, hidden_dim), name="c_state") + h_init = te.compute( + (1, batch_size, proj_dim), + lambda _, b, i: h_init[b, i] if h_init is not None else 0.0, + name="h_init", + ) + c_init = te.compute( + (1, batch_size, hidden_dim), + lambda _, b, i: c_init[b, i] if c_init is not None else 0.0, + name="c_init", + ) + + # begin scan computations, first the (batched) hidden-hidden dense + kh = te.reduce_axis((0, proj_dim), name="kh2h") + s_h2h = te.compute( + (scan_len, batch_size, 4, hidden_dim), + lambda t, b, i, j: te.sum(h_state[t - 1, b, kh] * Wh[i * hidden_dim + j, kh], axis=kh), + name="s_h2h", + ) + if Bh is not None: + s_h2h = te.compute( + s_h2h.shape, + lambda t, b, i, j: s_h2h[t, b, i, j] + Bh[i * hidden_dim + j], + name="s_h2h_bias", + tag=tag.INJECTIVE, + ) + + # helper to reverse time if scanning backwards + get_x_t = lambda t: seq_len - t if reverse else t - 1 + + gates = te.compute( + (scan_len, batch_size, 4, hidden_dim), + lambda t, b, i, j: Xi2h[get_x_t(t) * batch_size + b, i * hidden_dim + j] + + s_h2h[t, b, i, j], + name="gates", + tag=tag.INJECTIVE, + ) + + # helper to correctly read each gate dense from the batched output + read_gate = lambda t, b, j, idx: gates[t, b, idx, j] + + gate_shape = (scan_len, batch_size, hidden_dim) + + # compute the activated gates (and do some extra stuff if peephole weights are present) + if p_i is not None and p_f is not None: + i_gate = te.compute( + gate_shape, + lambda t, b, j: f_act( + read_gate(t, b, j, i_gate_idx) + p_i[b, j] * c_state[t - 1, b, j] + ), + name="i_gate_p", + tag=tag.INJECTIVE, + ) + f_gate = te.compute( + gate_shape, + lambda t, b, j: f_act( + read_gate(t, b, j, f_gate_idx) + p_f[b, j] * c_state[t - 1, b, j] + ), + name="f_gate_p", + tag=tag.INJECTIVE, + ) + else: + i_gate = te.compute( + gate_shape, + lambda *i: f_act(read_gate(*i, i_gate_idx)), + name="i_gate", + tag=tag.INJECTIVE, + ) + f_gate = te.compute( + gate_shape, + lambda *i: f_act(read_gate(*i, f_gate_idx)), + name="f_gate", + tag=tag.INJECTIVE, + ) + + g_gate = te.compute( + gate_shape, lambda *i: g_act(read_gate(*i, g_gate_idx)), name="g_gate", tag=tag.INJECTIVE + ) + + next_c = te.compute( + gate_shape, + lambda t, b, j: f_gate[t, b, j] * c_state[t - 1, b, j] + i_gate[t, b, j] * g_gate[t, b, j], + name="next_c", + ) + + if p_o is not None: + o_gate = te.compute( + 
gate_shape, + lambda t, b, j: f_act(read_gate(t, b, j, o_gate_idx) + p_o[b, j] * next_c[t, b, j]), + name="o_gate_p", + tag=tag.INJECTIVE, + ) + else: + o_gate = te.compute( + gate_shape, + lambda *i: f_act(read_gate(*i, o_gate_idx)), + name="o_gate", + tag=tag.INJECTIVE, + ) + + next_h = te.compute(gate_shape, lambda *i: o_gate(*i) * h_act(next_c(*i)), name="next_h") + + # project hidden state back to proj_dim if projection matrix is present + if proj is not None: + kr = te.reduce_axis((0, hidden_dim), name="kh2p") + next_h = te.compute( + (scan_len, batch_size, proj_dim), + lambda t, b, j: te.sum(next_h[t, b, kr] * proj[j, kr], axis=kr), + name="next_h_proj", + ) + + scan_h, scan_c = te.scan( + [h_init, c_init], [next_h, next_c], [h_state, c_state], name="lstm_scan" + ) + + # drop the initial values, TODO(@altanh): is there a better way? + scan_h = te.compute( + (seq_len, batch_size, proj_dim), lambda t, b, j: scan_h[t + 1, b, j], name="hidden_states" + ) + scan_c = te.compute( + (seq_len, batch_size, hidden_dim), lambda t, b, j: scan_c[t + 1, b, j], name="cell_states" + ) + + return scan_h, scan_c diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index 21ddf6fc55361..2f091cba10b7d 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -76,3 +76,4 @@ from .dense import dense from .searchsorted import searchsorted_ref from .conv2d_backcward_weight_python import conv2d_backward_weight_python +from .lstm_python import lstm_python diff --git a/python/tvm/topi/testing/lstm_python.py b/python/tvm/topi/testing/lstm_python.py new file mode 100644 index 0000000000000..ef1bce33658bc --- /dev/null +++ b/python/tvm/topi/testing/lstm_python.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
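As context for the `lstm` kernel above: everything hinges on `te.scan`, where the init stage occupies time step 0 and the update stage reads the state placeholder at `t - 1` (hence `scan_len = seq_len + 1`). A minimal standalone cumulative-sum sketch of that idiom, with illustrative names only, not part of this patch:

```
# Minimal te.scan sketch (illustrative only): a running sum over rows of X.
# The init stage fills row 0; the update stage reads the state at t - 1,
# the same pattern the LSTM kernel uses for its hidden and cell states.
import tvm
from tvm import te

m = te.var("m")  # sequence length
n = te.var("n")  # feature width
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n), name="s_state")
s_init = te.compute((1, n), lambda _, i: X[0, i], name="s_init")
s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i], name="s_update")
s_scan = te.scan(s_init, s_update, s_state, inputs=[X], name="running_sum")

s = te.create_schedule(s_scan.op)
print(tvm.lower(s, [X, s_scan], simple_mode=True))
```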
+# pylint: disable=invalid-name +"""LSTM reference implementation using numpy.""" +import numpy as np + + +def lstm_python( + Xs: np.array, + Wi: np.array, + Wh: np.array, + Bi: np.array = None, + Bh: np.array = None, + h_init: np.array = None, + c_init: np.array = None, + proj: np.array = None, + p_i: np.array = None, + p_f: np.array = None, + p_o: np.array = None, + f_act: str = "sigmoid", + g_act: str = "tanh", + h_act: str = "tanh", + reverse: bool = False, + weight_layout: str = "IFGO", +): + """LSTM reference implementation using numpy + + Parameters + ---------- + Xs : np.array + (seq_length, batch_size, in_dim) + Wi : np.array + (4 * hidden_dim, in_dim) + Wh : np.array + (4 * hidden_dim, out_dim) where out_dim = proj_dim if proj_dim > 0, else hidden_dim + Bi : np.array, optional + (4 * hidden_dim,), by default None + Bh : np.array, optional + (4 * hidden_dim,), by default None + h_init : np.array, optional + (batch_size, out_dim), by default None + c_init : np.array, optional + (batch_size, hidden_dim), by default None + proj : np.array, optional + (proj_dim, hidden_dim), by default None + p_i, p_f, p_o: np.array, optional + (batch_size, hidden_dim), by default None + f_act, g_act, h_act: str, optional + activations, by default "sigmoid", "tanh", "tanh" + reverse : bool, optional + process Xs in reverse, by default False + weight_layout : str, optional + Packed layout for weights and biases, by default "IFGO" + """ + i_gate_idx = weight_layout.find("I") + f_gate_idx = weight_layout.find("F") + g_gate_idx = weight_layout.find("G") + o_gate_idx = weight_layout.find("O") + + str2act = {"sigmoid": lambda x: 1 / (1 + np.exp(-x)), "tanh": np.tanh} + + f_act = str2act[f_act] + g_act = str2act[g_act] + h_act = str2act[h_act] + + S, B, F = Xs.shape + H = Wi.shape[0] // 4 + O = Wh.shape[1] + + # make life a bit easier + Wi = np.reshape(Wi, (4, H, F)) + Wh = np.reshape(Wh, (4, H, O)) + if Bi is not None: + Bi = np.reshape(Bi, (4, H)) + if Bh is not None: + Bh = np.reshape(Bh, (4, H)) + + h0 = h_init if h_init is not None else np.zeros((B, O), "float32") + c0 = c_init if c_init is not None else np.zeros((B, H), "float32") + + hs = [h0] + cs = [c0] + + for t in range(S): + x = Xs[S - t - 1 if reverse else t] + xh = [np.matmul(x, Wi[g].T) for g in range(4)] + if Bi is not None: + xh = [xh[g] + Bi[g] for g in range(4)] + + hh = [np.matmul(hs[t], Wh[g].T) for g in range(4)] + if Bh is not None: + hh = [hh[g] + Bh[g] for g in range(4)] + + sums = [xh[g] + hh[g] for g in range(4)] + + if p_i is not None and p_f is not None: + i_gate = f_act(sums[i_gate_idx] + p_i * cs[t]) + f_gate = f_act(sums[f_gate_idx] + p_f * cs[t]) + else: + i_gate = f_act(sums[i_gate_idx]) + f_gate = f_act(sums[f_gate_idx]) + + g_gate = g_act(sums[g_gate_idx]) + + next_c = f_gate * cs[t] + i_gate * g_gate + + if p_o is not None: + o_gate = f_act(sums[o_gate_idx] + p_o * next_c) + else: + o_gate = f_act(sums[o_gate_idx]) + + next_h = o_gate * h_act(next_c) + + if proj is not None: + next_h = np.matmul(next_h, proj.T) + + hs.append(next_h) + cs.append(next_c) + + return np.stack(hs[1:], axis=0), np.stack(cs[1:], axis=0) diff --git a/tests/python/topi/python/test_topi_lstm.py b/tests/python/topi/python/test_topi_lstm.py new file mode 100644 index 0000000000000..08ed5d73523d0 --- /dev/null +++ b/tests/python/topi/python/test_topi_lstm.py @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Test code for LSTM.""" +import numpy as np +from rsa import verify +import tvm +from tvm import te, topi +import tvm.testing +import tvm.topi.testing + + +def verify_lstm( + target, + dev, + seq_len, + batch_size, + in_dim, + hidden_dim, + proj_dim=0, + bias=True, + zero_init=True, + peephole=False, + reverse=False, + weight_layout="IFGO", +): + out_dim = proj_dim if proj_dim > 0 else hidden_dim + + def rand(*shape): + sqrt_k = np.sqrt(1 / hidden_dim) + return np.random.uniform(-sqrt_k, sqrt_k, size=shape).astype("float32") + + def get_ref_data(): + Xs = np.random.normal(size=(seq_len, batch_size, in_dim)).astype("float32") + Wi = rand(4 * hidden_dim, in_dim) + Wh = rand(4 * hidden_dim, out_dim) + Bi = None + Bh = None + h0 = None + c0 = None + proj = None + p_i = None + p_f = None + p_o = None + + if bias: + Bi = rand(4 * hidden_dim) + Bh = rand(4 * hidden_dim) + + if not zero_init: + h0 = np.random.normal(size=(batch_size, out_dim)).astype("float32") + c0 = np.random.normal(size=(batch_size, hidden_dim)).astype("float32") + + if proj_dim > 0: + proj = rand(proj_dim, hidden_dim) + + if peephole: + p_i, p_f, p_o = [rand(batch_size, hidden_dim) for _ in range(3)] + + hs, cs = tvm.topi.testing.lstm_python( + Xs, + Wi, + Wh, + Bi=Bi, + Bh=Bh, + h_init=h0, + c_init=c0, + proj=proj, + p_i=p_i, + p_f=p_f, + p_o=p_o, + reverse=reverse, + weight_layout=weight_layout, + ) + + return [Xs, Wi, Wh, Bi, Bh, h0, c0, proj, p_i, p_f, p_o], [hs, cs] + + args_np, (hs_np, cs_np) = get_ref_data() + + args = [te.placeholder(a.shape, "float32") if a is not None else a for a in args_np] + real_args = [a for a in args if a is not None] + + hs, cs = topi.nn.lstm(*args, reverse=reverse, weight_layout=weight_layout) + with tvm.target.Target(target): + sch = topi.generic.schedule_lstm([hs, cs]) + func = tvm.build(sch, real_args + [hs, cs], target=target) + + args_nd = [tvm.nd.array(a, dev) for a in args_np if a is not None] + hs_nd = tvm.nd.array(np.zeros((seq_len, batch_size, out_dim), "float32"), dev) + cs_nd = tvm.nd.array(np.zeros((seq_len, batch_size, hidden_dim), "float32"), dev) + func(*args_nd, hs_nd, cs_nd) + + tvm.testing.assert_allclose(hs_nd.numpy(), hs_np, rtol=1e-4) + tvm.testing.assert_allclose(cs_nd.numpy(), cs_np, rtol=1e-4) + + +def test_lstm(): + verify_lstm( + "llvm", + tvm.cpu(0), + 1, + 1, + 1, + 1, + 0, + True, + True, + False, + False, + "IFGO", + ) + + verify_lstm( + "llvm", + tvm.cpu(0), + 8, + 4, + 8, + 16, + 0, + True, + False, + False, + False, + "IFGO", + ) + + +def test_lstm_proj(): + verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 8, True, True, False, False, "IFGO") + + +def test_lstm_peephole(): + verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, True, False, "IFGO") + + +def test_lstm_reverse(): + verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, 
False, True, "IFGO") + + +def test_lstm_weight_layout_iofg(): + # IOFG is used by ONNX, while IFGO is used by PyTorch + verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 0, True, True, False, False, "IOFG") + + +def test_lstm_assorted(): + verify_lstm("llvm", tvm.cpu(0), 8, 4, 16, 32, 16, True, False, True, True, "OIGF") From 12440895e4baad1de494f0a3876edee3e1df06ee Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 7 Jun 2022 11:08:32 -0700 Subject: [PATCH 058/181] [MetaSchedule] Add Testing Script with ONNX Support (#11587) This PR introduces 2 tuning script for meta schedule and auto scheduler tuning support with onnx files. Now we can easily introduce onnx models benchmarking with command line scripts. Sample tuning call looks similar to the following script For Meta Schedule ONNX tuning: ``` python3 -m tvm.meta_schedule.testing.tune_onnx_meta_schedule \ --model-name "$MODEL_NAME" \ --onnx-path "$ONNX_PATH" \ --input-shape "$INPUT_SHAPE" \ --target "$TARGET" \ --num-trials $NUM_TRIALS \ --rpc-host $RPC_HOST \ --rpc-port $RPC_PORT \ --rpc-key $RPC_KEY \ --rpc-workers $RPC_WORKERS \ --work-dir $WORK_DIR \ |& tee "$WORK_DIR/$MODEL_NAME.log" ``` For AutoScheduler ONNX tuning: ``` python3 -m tvm.meta_schedule.testing.tune_onnx_auto_scheduler \ --model-name "$MODEL_NAME" \ --onnx-path "$ONNX_PATH" \ --input-shape "$INPUT_SHAPE" \ --target "$TARGET" \ --num-trials $NUM_TRIALS \ --rpc-host $RPC_HOST \ --rpc-port $RPC_PORT \ --rpc-key $RPC_KEY \ --rpc-workers $RPC_WORKERS \ --log-dir $WORK_DIR \ |& tee "$WORK_DIR/$MODEL_NAME.log" ``` --- .../testing/tune_onnx_auto_scheduler.py | 238 ++++++++++++++++++ .../testing/tune_onnx_meta_schedule.py | 199 +++++++++++++++ .../testing/tune_relay_auto_scheduler.py | 4 +- 3 files changed, 439 insertions(+), 2 deletions(-) create mode 100644 python/tvm/meta_schedule/testing/tune_onnx_auto_scheduler.py create mode 100644 python/tvm/meta_schedule/testing/tune_onnx_meta_schedule.py diff --git a/python/tvm/meta_schedule/testing/tune_onnx_auto_scheduler.py b/python/tvm/meta_schedule/testing/tune_onnx_auto_scheduler.py new file mode 100644 index 0000000000000..e916f5ace3393 --- /dev/null +++ b/python/tvm/meta_schedule/testing/tune_onnx_auto_scheduler.py @@ -0,0 +1,238 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
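Both new scripts describe the network inputs through `--input-shape`, a JSON list with one `{name, dtype, shape}` entry per graph input (see the `help` string below). A small sketch of composing that argument and of the kind of random inputs the scripts derive from it; the input name and shape here are placeholders, not values taken from this patch:

```
# Sketch of the --input-shape JSON consumed by both scripts: one entry per
# graph input with "name", "dtype" and "shape" keys. The input name and the
# 1x3x224x224 shape below are placeholders.
import json

import numpy as np

input_shape = [{"name": "data", "dtype": "float32", "shape": [1, 3, 224, 224]}]
arg = json.dumps(input_shape)  # string to pass via --input-shape

# Random inputs are then generated per entry, mirroring the scripts:
# uniform floats for float dtypes, bounded random integers otherwise.
inputs = {}
for item in json.loads(arg):
    if item["dtype"].startswith("float"):
        inputs[item["name"]] = np.random.uniform(size=item["shape"]).astype(item["dtype"])
    else:
        inputs[item["name"]] = np.random.randint(0, 10000, size=item["shape"]).astype(item["dtype"])
print({name: value.shape for name, value in inputs.items()})
```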
+# pylint: disable=missing-docstring +import argparse +import json +import os + +import numpy as np # type: ignore +import onnx # type: ignore +import tvm +from tvm.relay.frontend import from_onnx +from tvm import auto_scheduler +from tvm import meta_schedule as ms +from tvm import relay +from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc + + +def _parse_args(): + args = argparse.ArgumentParser() + args.add_argument( + "--model-name", + type=str, + required=True, + ) + args.add_argument( + "--onnx-path", + type=str, + required=True, + ) + args.add_argument( + "--input-shape", + type=str, + required=True, + help='example: `[{"name": "input1", "dtype": "int64", "shape": [1, 1, 8]}]', + ) + args.add_argument( + "--target", + type=str, + required=True, + ) + args.add_argument( + "--num-trials", + type=int, + required=True, + ) + args.add_argument( + "--rpc-host", + type=str, + required=True, + ) + args.add_argument( + "--rpc-port", + type=int, + required=True, + ) + args.add_argument( + "--rpc-key", + type=str, + required=True, + ) + args.add_argument( + "--rpc-workers", + type=int, + required=True, + ) + args.add_argument( + "--work-dir", + type=str, + required=True, + ) + parsed = args.parse_args() + parsed.target = tvm.target.Target(parsed.target) + parsed.input_shape = json.loads(parsed.input_shape) + parsed.rpc_config = ms.runner.RPCConfig( + tracker_host=parsed.rpc_host, + tracker_port=parsed.rpc_port, + tracker_key=parsed.rpc_key, + session_timeout_sec=3600, + ) + return parsed + + +ARGS = _parse_args() + + +def main(): + log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json") + + runner = auto_scheduler.RPCRunner( + key=ARGS.rpc_key, + host=ARGS.rpc_host, + port=ARGS.rpc_port, + n_parallel=ARGS.rpc_workers, + number=3, + repeat=1, + min_repeat_ms=100, # TODO + enable_cpu_cache_flush=False, # TODO + ) + + if ARGS.target.kind.name == "llvm": + hardware_params = auto_scheduler.HardwareParams( + num_cores=int(ARGS.target.attrs["num-cores"]), + target=ARGS.target, + ) + elif ARGS.target.kind.name == "cuda": + hardware_params = auto_scheduler.HardwareParams( + num_cores=-1, + vector_unit_bytes=16, + cache_line_bytes=64, + max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]), + max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]), + # The value `max_local_memory_per_block` is not used in AutoScheduler, + # but is required by the API. 
+ max_local_memory_per_block=12345678, + max_vthread_extent=8, + warp_size=32, + ) + else: + raise NotImplementedError(f"Unsupported target {ARGS.target}") + + print(f"Workload: {ARGS.model_name}") + onnx_model = onnx.load(ARGS.onnx_path) + shape_dict = {} + for item in ARGS.input_shape: + print(f" input_name: {item['name']}") + print(f" input_shape: {item['shape']}") + print(f" input_dtype: {item['dtype']}") + shape_dict[item["name"]] = item["shape"] + mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") + print(task.compute_dag) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ) + ) + + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay.build( + mod, + target=ARGS.target, + params=params, + ) + graph, rt_mod, params = lib.graph_json, lib.lib, lib.params + input_data = {} + for item in ARGS.input_shape: + input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] + if input_dtype.startswith("float"): + input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) + else: + input_data[input_name] = np.random.randint( + low=0, high=10000, size=input_shape, dtype=input_dtype + ) + + def f_timer(rt_mod, dev, input_data): + # pylint: disable=import-outside-toplevel + from tvm.contrib.graph_executor import GraphModule + + # pylint: enable=import-outside-toplevel + + mod = GraphModule(rt_mod["default"](dev)) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + ftimer = mod.module.time_evaluator( + "run", + dev, + min_repeat_ms=500, + repeat=3, + ) + results = list(np.array(ftimer().results) * 1000.0) # type: ignore + print("Running time in time_evaluator: ", results) + + run_module_via_rpc( + rpc_config=ARGS.rpc_config, + lib=lib, + dev_type=ARGS.target.kind.name, + args=input_data, + continuation=f_timer, + ) + + def f_per_layer(rt_mod, dev, input_data): + # pylint: disable=import-outside-toplevel + from tvm.contrib.debugger.debug_executor import create + + # pylint: enable=import-outside-toplevel + mod = create(graph, rt_mod, dev) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] + graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) + print("|graph_nodes| = ", len(graph_nodes)) + print("|graph_time| = ", len(graph_time)) + graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} + for k, v in graph_nodes_time.items(): + print(f"{k} : {v:.3f}") + + run_module_via_rpc( + rpc_config=ARGS.rpc_config, + lib=rt_mod, + dev_type=ARGS.target.kind.name, + args=input_data, + continuation=f_per_layer, + ) + + +if __name__ == "__main__": + main() diff --git a/python/tvm/meta_schedule/testing/tune_onnx_meta_schedule.py b/python/tvm/meta_schedule/testing/tune_onnx_meta_schedule.py new file mode 100644 index 0000000000000..f5c7d1cde80b4 --- /dev/null +++ 
b/python/tvm/meta_schedule/testing/tune_onnx_meta_schedule.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +import argparse +import json +import logging +import numpy as np # type: ignore +import onnx # type: ignore +import tvm +from tvm.relay.frontend import from_onnx +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc + + +def _parse_args(): + args = argparse.ArgumentParser() + args.add_argument( + "--model-name", + type=str, + required=True, + ) + args.add_argument( + "--onnx-path", + type=str, + required=True, + ) + args.add_argument( + "--input-shape", + type=str, + required=True, + help='example: `[{"name": "input1", "dtype": "int64", "shape": [1, 1, 8]}]', + ) + args.add_argument( + "--target", + type=str, + required=True, + ) + args.add_argument( + "--num-trials", + type=int, + required=True, + ) + args.add_argument( + "--rpc-host", + type=str, + required=True, + ) + args.add_argument( + "--rpc-port", + type=int, + required=True, + ) + args.add_argument( + "--rpc-key", + type=str, + required=True, + ) + args.add_argument( + "--rpc-workers", + type=int, + required=True, + ) + args.add_argument( + "--work-dir", + type=str, + required=True, + ) + parsed = args.parse_args() + parsed.target = tvm.target.Target(parsed.target) + parsed.input_shape = json.loads(parsed.input_shape) + parsed.rpc_config = ms.runner.RPCConfig( + tracker_host=parsed.rpc_host, + tracker_port=parsed.rpc_port, + tracker_key=parsed.rpc_key, + session_timeout_sec=3600, + ) + return parsed + + +logging.basicConfig( + format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logging.getLogger("tvm.meta_schedule").setLevel(logging.INFO) +ARGS = _parse_args() + + +def main(): + print(f"Workload: {ARGS.model_name}") + onnx_model = onnx.load(ARGS.onnx_path) + shape_dict = {} + for item in ARGS.input_shape: + print(f" input_name: {item['name']}") + print(f" input_shape: {item['shape']}") + print(f" input_dtype: {item['dtype']}") + shape_dict[item["name"]] = item["shape"] + mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) + alloc_repeat = 1 + runner = ms.runner.RPCRunner( + rpc_config=ARGS.rpc_config, + evaluator_config=ms.runner.EvaluatorConfig( + number=3, + repeat=1, + min_repeat_ms=100, + enable_cpu_cache_flush=False, + ), + alloc_repeat=alloc_repeat, + max_workers=ARGS.rpc_workers, + ) + lib = ms.tune_relay( + mod=mod, + target=ARGS.target, + config=ms.TuneConfig( + strategy="evolutionary", + num_trials_per_iter=64, + max_trials_per_task=ARGS.num_trials, + max_trials_global=ARGS.num_trials, + ), + runner=runner, # type: ignore + work_dir=ARGS.work_dir, + params=params, + ) + graph, rt_mod, params = lib.graph_json, 
lib.lib, lib.params + input_data = {} + for item in ARGS.input_shape: + input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] + if input_dtype.startswith("float"): + input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) + else: + input_data[input_name] = np.random.randint( + low=0, high=10000, size=input_shape, dtype=input_dtype + ) + + def f_timer(rt_mod, dev, input_data): + # pylint: disable=import-outside-toplevel + from tvm.contrib.graph_executor import GraphModule + + # pylint: enable=import-outside-toplevel + + mod = GraphModule(rt_mod["default"](dev)) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + ftimer = mod.module.time_evaluator( + "run", + dev, + min_repeat_ms=500, + repeat=3, + ) + results = list(np.array(ftimer().results) * 1000.0) # type: ignore + print("Running time in time_evaluator: ", results) + + run_module_via_rpc( + rpc_config=ARGS.rpc_config, + lib=lib, + dev_type=ARGS.target.kind.name, + args=input_data, + continuation=f_timer, + ) + + def f_per_layer(rt_mod, dev, input_data): + # pylint: disable=import-outside-toplevel + from tvm.contrib.debugger.debug_executor import create + + # pylint: enable=import-outside-toplevel + mod = create(graph, rt_mod, dev) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] + graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) + print("|graph_nodes| = ", len(graph_nodes)) + print("|graph_time| = ", len(graph_time)) + graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} + for k, v in graph_nodes_time.items(): + print(f"{k} : {v:.3f}") + + run_module_via_rpc( + rpc_config=ARGS.rpc_config, + lib=rt_mod, + dev_type=ARGS.target.kind.name, + args=input_data, + continuation=f_per_layer, + ) + + +if __name__ == "__main__": + main() diff --git a/python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py b/python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py index abac49c50c6ee..ff4f9313470c9 100644 --- a/python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py +++ b/python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py @@ -71,7 +71,7 @@ def _parse_args(): required=True, ) args.add_argument( - "--log-dir", + "--work-dir", type=str, required=True, ) @@ -96,7 +96,7 @@ def _parse_args(): def main(): - log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json") + log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, From 81702192b49ddb37ce3e179eec3e88f3726acec1 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 7 Jun 2022 13:38:03 -0500 Subject: [PATCH 059/181] [MetaSchedule] Resolve dependencies between header files (#11604) * [MetaSchedule] Resolve dependencies between header files After PR11590 TVM stopped compiling with clang-14 and libc++. The problems were caused by incomplete types used in contexts where complete types were required. To resolve this, some code had to be moved into .cc files. Also the MeasureCandidate classes needed to be added to their own include files (or otherwise there would be a circular dependency between headers). All headers from the meta_schedule directory were updated to include all their dependencies (forward declarations were left where appropriate). 
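The underlying rule is the usual incomplete-type restriction: a forward declaration is enough for pointers, references, and declarations, but member access, copies, sizeof, and many libc++ containers need the complete definition at the point of use, which is why the inline method bodies moved into .cc files. A minimal standalone illustration with hypothetical names (not TVM code):

```
// Minimal illustration with hypothetical names: why code that compiled with
// one standard library can fail under libc++/clang-14 when a type is still
// incomplete at the point of use.
#include <vector>

class Widget;                          // forward declaration: incomplete type

void Declare(const Widget& w);         // OK: only the name is required
// int Broken(const Widget& w) { return w.Size(); }  // error: incomplete type

class Widget {                         // complete definition
 public:
  int Size() const { return 3; }
};

// OK: the full definition is visible here, so member access and instantiating
// std::vector<Widget> are both allowed (e.g. from a .cc file).
int Works(const Widget& w) { return w.Size(); }

int main() {
  std::vector<Widget> v(2);
  return Works(v.front()) - 3;
}
```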
* Fix a typo: PySpaceGeneratorCode -> PySpaceGeneratorNode --- .../tvm/meta_schedule/apply_history_best.h | 9 ++- include/tvm/meta_schedule/arg_info.h | 3 + include/tvm/meta_schedule/builder.h | 8 +++ include/tvm/meta_schedule/cost_model.h | 34 ++++----- include/tvm/meta_schedule/database.h | 7 ++ include/tvm/meta_schedule/extracted_task.h | 7 +- include/tvm/meta_schedule/feature_extractor.h | 13 ++-- include/tvm/meta_schedule/measure_callback.h | 11 +-- include/tvm/meta_schedule/measure_candidate.h | 67 ++++++++++++++++++ include/tvm/meta_schedule/mutator.h | 18 +++-- include/tvm/meta_schedule/postproc.h | 15 ++-- include/tvm/meta_schedule/runner.h | 6 ++ include/tvm/meta_schedule/schedule_rule.h | 20 +++--- include/tvm/meta_schedule/search_strategy.h | 69 ++++--------------- include/tvm/meta_schedule/space_generator.h | 21 +++--- include/tvm/meta_schedule/task_scheduler.h | 47 +++---------- include/tvm/meta_schedule/tune_context.h | 8 +++ src/meta_schedule/cost_model/cost_model.cc | 24 +++++++ .../feature_extractor/feature_extractor.cc | 6 ++ .../measure_callback/measure_callback.cc | 9 +++ src/meta_schedule/mutator/mutator.cc | 12 ++++ src/meta_schedule/postproc/postproc.cc | 11 +++ .../schedule_rule/schedule_rule.cc | 12 ++++ .../search_strategy/search_strategy.cc | 27 +++++++- .../space_generator/space_generator.cc | 12 ++++ .../task_scheduler/task_scheduler.cc | 37 ++++++++++ 26 files changed, 344 insertions(+), 169 deletions(-) create mode 100644 include/tvm/meta_schedule/measure_candidate.h diff --git a/include/tvm/meta_schedule/apply_history_best.h b/include/tvm/meta_schedule/apply_history_best.h index b5504a8ee0f8c..5b1816cef41ff 100644 --- a/include/tvm/meta_schedule/apply_history_best.h +++ b/include/tvm/meta_schedule/apply_history_best.h @@ -19,7 +19,14 @@ #ifndef TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_ #define TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_ +#include #include +#include +#include +#include +#include +#include +#include #include namespace tvm { @@ -36,7 +43,7 @@ class ApplyHistoryBestNode : public runtime::Object { /*! \brief The logging function to be used */ PackedFunc logging_func; - void VisitAttrs(AttrVisitor* v) { v->Visit("database", &database); } + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("database", &database); } /*! 
* \brief Query the best entry from the database * \param task_name The name of the task to be queried diff --git a/include/tvm/meta_schedule/arg_info.h b/include/tvm/meta_schedule/arg_info.h index 08553a001374e..c7dd3c7f65385 100644 --- a/include/tvm/meta_schedule/arg_info.h +++ b/include/tvm/meta_schedule/arg_info.h @@ -20,7 +20,10 @@ #define TVM_META_SCHEDULE_ARG_INFO_H_ #include +#include #include +#include +#include #include namespace tvm { diff --git a/include/tvm/meta_schedule/builder.h b/include/tvm/meta_schedule/builder.h index 2b809459155ec..e41dc900a00da 100644 --- a/include/tvm/meta_schedule/builder.h +++ b/include/tvm/meta_schedule/builder.h @@ -20,6 +20,14 @@ #define TVM_META_SCHEDULE_BUILDER_H_ #include +#include +#include +#include +#include +#include +#include +#include +#include #include namespace tvm { diff --git a/include/tvm/meta_schedule/cost_model.h b/include/tvm/meta_schedule/cost_model.h index 6fadc2fb9c137..91d19c430b1fe 100644 --- a/include/tvm/meta_schedule/cost_model.h +++ b/include/tvm/meta_schedule/cost_model.h @@ -20,7 +20,15 @@ #ifndef TVM_META_SCHEDULE_COST_MODEL_H_ #define TVM_META_SCHEDULE_COST_MODEL_H_ -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -126,28 +134,12 @@ class PyCostModelNode : public CostModelNode { // `f_as_string` is not visited } - void Load(const String& path) { - ICHECK(f_load != nullptr) << "PyCostModel's Load method not implemented!"; - f_load(path); - } - - void Save(const String& path) { - ICHECK(f_save != nullptr) << "PyCostModel's Save method not implemented!"; - f_save(path); - } + void Load(const String& path); + void Save(const String& path); void Update(const TuneContext& context, const Array& candidates, - const Array& results) { - ICHECK(f_update != nullptr) << "PyCostModel's Update method not implemented!"; - f_update(context, candidates, results); - } - + const Array& results); std::vector Predict(const TuneContext& context, - const Array& candidates) { - ICHECK(f_predict != nullptr) << "PyCostModel's Predict method not implemented!"; - std::vector result(candidates.size(), 0.0); - f_predict(context, candidates, result.data()); - return result; - } + const Array& candidates); static constexpr const char* _type_key = "meta_schedule.PyCostModel"; TVM_DECLARE_FINAL_OBJECT_INFO(PyCostModelNode, CostModelNode); diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index f07d8e1366441..1353dec3eda3f 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -19,7 +19,14 @@ #ifndef TVM_META_SCHEDULE_DATABASE_H_ #define TVM_META_SCHEDULE_DATABASE_H_ +#include +#include #include +#include +#include +#include +#include +#include #include #include diff --git a/include/tvm/meta_schedule/extracted_task.h b/include/tvm/meta_schedule/extracted_task.h index c6613427fd5b6..898b974d87726 100644 --- a/include/tvm/meta_schedule/extracted_task.h +++ b/include/tvm/meta_schedule/extracted_task.h @@ -19,6 +19,11 @@ #ifndef TVM_META_SCHEDULE_EXTRACTED_TASK_H_ #define TVM_META_SCHEDULE_EXTRACTED_TASK_H_ +#include +#include +#include +#include +#include #include namespace tvm { @@ -38,7 +43,7 @@ class ExtractedTaskNode : public runtime::Object { /*! 
\brief Weight of the task */ int weight; - void VisitAttrs(AttrVisitor* v) { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("task_name", &task_name); v->Visit("mod", &mod); v->Visit("target", &target); diff --git a/include/tvm/meta_schedule/feature_extractor.h b/include/tvm/meta_schedule/feature_extractor.h index c2ca2beb9b686..02e9f26b2a600 100644 --- a/include/tvm/meta_schedule/feature_extractor.h +++ b/include/tvm/meta_schedule/feature_extractor.h @@ -20,7 +20,13 @@ #ifndef TVM_META_SCHEDULE_FEATURE_EXTRACTOR_H_ #define TVM_META_SCHEDULE_FEATURE_EXTRACTOR_H_ -#include +#include +#include +#include +#include +#include +#include +#include namespace tvm { namespace meta_schedule { @@ -76,10 +82,7 @@ class PyFeatureExtractorNode : public FeatureExtractorNode { } Array ExtractFrom(const TuneContext& context, - const Array& candidates) { - ICHECK(f_extract_from != nullptr) << "PyFeatureExtractor's ExtractFrom method not implemented!"; - return f_extract_from(context, candidates); - } + const Array& candidates) final; static constexpr const char* _type_key = "meta_schedule.PyFeatureExtractor"; TVM_DECLARE_FINAL_OBJECT_INFO(PyFeatureExtractorNode, FeatureExtractorNode); diff --git a/include/tvm/meta_schedule/measure_callback.h b/include/tvm/meta_schedule/measure_callback.h index e9abb123012ab..151582d4c9ce6 100644 --- a/include/tvm/meta_schedule/measure_callback.h +++ b/include/tvm/meta_schedule/measure_callback.h @@ -21,9 +21,15 @@ #define TVM_META_SCHEDULE_MEASURE_CALLBACK_H_ #include +#include #include #include #include +#include +#include +#include +#include +#include namespace tvm { namespace meta_schedule { @@ -94,10 +100,7 @@ class PyMeasureCallbackNode : public MeasureCallbackNode { int task_id, // const Array& measure_candidates, // const Array& builds, // - const Array& results) final { - ICHECK(f_apply != nullptr) << "PyMeasureCallback's Apply method not implemented!"; - return this->f_apply(task_scheduler, task_id, measure_candidates, builds, results); - } + const Array& results); static constexpr const char* _type_key = "meta_schedule.PyMeasureCallback"; TVM_DECLARE_FINAL_OBJECT_INFO(PyMeasureCallbackNode, MeasureCallbackNode); diff --git a/include/tvm/meta_schedule/measure_candidate.h b/include/tvm/meta_schedule/measure_candidate.h new file mode 100644 index 0000000000000..f7257b56d2067 --- /dev/null +++ b/include/tvm/meta_schedule/measure_candidate.h @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_META_SCHEDULE_MEASURE_CANDIDATE_H_ +#define TVM_META_SCHEDULE_MEASURE_CANDIDATE_H_ + +#include +#include +#include +#include +#include + +namespace tvm { +namespace meta_schedule { + +/*! \brief The schedule (with input shapes) to be measured. 
*/ +class MeasureCandidateNode : public runtime::Object { + public: + /*! \brief The schedule for measurement. */ + tir::Schedule sch; + /*! \brief The argument information, e.g., (shape, dtype) for tensors. */ + Array args_info; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("sch", &sch); + v->Visit("args_info", &args_info); + } + + static constexpr const char* _type_key = "meta_schedule.MeasureCandidate"; + TVM_DECLARE_FINAL_OBJECT_INFO(MeasureCandidateNode, Object); +}; + +/*! + * \brief Managed reference to MeasureCandidateNode. + * \sa MeasureCandidateNode + */ +class MeasureCandidate : public runtime::ObjectRef { + public: + /*! + * \brief Constructor of MeasureCandidate. + * \param sch The schedule for measurement. + * \param args_info The argument information, e.g., (shape, dtype) for tensors. + */ + TVM_DLL MeasureCandidate(tir::Schedule sch, Array args_info); + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(MeasureCandidate, ObjectRef, MeasureCandidateNode); +}; + +} // namespace meta_schedule +} // namespace tvm + +#endif // TVM_META_SCHEDULE_MEASURE_CANDIDATE_H_ diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index d80fa70eee8a2..566cc82e9716d 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -20,7 +20,13 @@ #ifndef TVM_META_SCHEDULE_MUTATOR_H_ #define TVM_META_SCHEDULE_MUTATOR_H_ +#include +#include +#include +#include +#include #include +#include namespace tvm { namespace meta_schedule { @@ -89,17 +95,9 @@ class PyMutatorNode : public MutatorNode { // `f_as_string` is not visited } - void InitializeWithTuneContext(const TuneContext& context) final { - ICHECK(f_initialize_with_tune_context != nullptr) - << "PyMutator's InitializeWithTuneContext method not implemented!"; - this->f_initialize_with_tune_context(context); - } - + void InitializeWithTuneContext(const TuneContext& context) final; Optional Apply(const tir::Trace& trace, - support::LinearCongruentialEngine::TRandState* rand_state) final { - ICHECK(f_apply != nullptr) << "PyMutator's Apply method not implemented!"; - return this->f_apply(trace, *rand_state); - } + support::LinearCongruentialEngine::TRandState* rand_state) final; static constexpr const char* _type_key = "meta_schedule.PyMutator"; TVM_DECLARE_FINAL_OBJECT_INFO(PyMutatorNode, MutatorNode); diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 195d558550170..738e726aa146b 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -20,6 +20,9 @@ #ifndef TVM_META_SCHEDULE_POSTPROC_H_ #define TVM_META_SCHEDULE_POSTPROC_H_ +#include +#include +#include #include namespace tvm { @@ -88,16 +91,8 @@ class PyPostprocNode : public PostprocNode { // `f_as_string` is not visited } - void InitializeWithTuneContext(const TuneContext& context) final { - ICHECK(f_initialize_with_tune_context != nullptr) - << "PyPostproc's InitializeWithTuneContext method not implemented!"; - this->f_initialize_with_tune_context(context); - } - - bool Apply(const tir::Schedule& sch) final { - ICHECK(f_apply != nullptr) << "PyPostproc's Apply method not implemented!"; - return this->f_apply(sch); - } + void InitializeWithTuneContext(const TuneContext& context) final; + bool Apply(const tir::Schedule& sch) final; static constexpr const char* _type_key = "meta_schedule.PyPostproc"; TVM_DECLARE_FINAL_OBJECT_INFO(PyPostprocNode, PostprocNode); diff --git a/include/tvm/meta_schedule/runner.h b/include/tvm/meta_schedule/runner.h index 
61023c8e2db05..c095728369312 100644 --- a/include/tvm/meta_schedule/runner.h +++ b/include/tvm/meta_schedule/runner.h @@ -21,6 +21,12 @@ #include #include +#include +#include +#include +#include +#include +#include namespace tvm { namespace meta_schedule { diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index b39c72e24db8e..7e0e5bda57b60 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -20,6 +20,14 @@ #ifndef TVM_META_SCHEDULE_SCHEDULE_RULE_H_ #define TVM_META_SCHEDULE_SCHEDULE_RULE_H_ +#include +#include +#include +#include +#include +#include +#include +#include #include namespace tvm { @@ -90,16 +98,8 @@ class PyScheduleRuleNode : public ScheduleRuleNode { // `f_as_string` is not visited } - void InitializeWithTuneContext(const TuneContext& context) final { - ICHECK(f_initialize_with_tune_context != nullptr) - << "PyScheduleRule's InitializeWithTuneContext method not implemented!"; - this->f_initialize_with_tune_context(context); - } - - Array Apply(const tir::Schedule& sch, const tir::BlockRV& block) final { - ICHECK(f_apply != nullptr) << "PyScheduleRule's Apply method not implemented!"; - return this->f_apply(sch, block); - } + void InitializeWithTuneContext(const TuneContext& context) final; + Array Apply(const tir::Schedule& sch, const tir::BlockRV& block) final; static constexpr const char* _type_key = "meta_schedule.PyScheduleRule"; TVM_DECLARE_FINAL_OBJECT_INFO(PyScheduleRuleNode, ScheduleRuleNode); diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h index 139de7c99d042..baae22f0d98ec 100644 --- a/include/tvm/meta_schedule/search_strategy.h +++ b/include/tvm/meta_schedule/search_strategy.h @@ -20,7 +20,15 @@ #define TVM_META_SCHEDULE_SEARCH_STRATEGY_H_ #include +#include +#include +#include #include +#include +#include +#include +#include +#include #include namespace tvm { @@ -28,40 +36,6 @@ namespace meta_schedule { // Forward declaration class TuneContext; -class CostModel; -class Database; - -/*! \brief The schedule (with input shapes) to be measured. */ -class MeasureCandidateNode : public runtime::Object { - public: - /*! \brief The schedule for measurement. */ - tir::Schedule sch; - /*! \brief The argument information, e.g., (shape, dtype) for tensors. */ - Array args_info; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("sch", &sch); - v->Visit("args_info", &args_info); - } - - static constexpr const char* _type_key = "meta_schedule.MeasureCandidate"; - TVM_DECLARE_FINAL_OBJECT_INFO(MeasureCandidateNode, Object); -}; - -/*! - * \brief Managed reference to MeasureCandidateNode. - * \sa MeasureCandidateNode - */ -class MeasureCandidate : public runtime::ObjectRef { - public: - /*! - * \brief Constructor of MeasureCandidate. - * \param sch The schedule for measurement. - * \param args_info The argument information, e.g., (shape, dtype) for tensors. - */ - TVM_DLL MeasureCandidate(tir::Schedule sch, Array args_info); - TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(MeasureCandidate, ObjectRef, MeasureCandidateNode); -}; /*! * \brief The search strategy for measure candidates generation. 
@@ -198,33 +172,14 @@ class PySearchStrategyNode : public SearchStrategyNode { // `f_notify_runner_results` is not visited } - void InitializeWithTuneContext(const TuneContext& context) final { - ICHECK(f_initialize_with_tune_context != nullptr) - << "PySearchStrategy's InitializeWithTuneContext method not implemented!"; - this->f_initialize_with_tune_context(context); - } - + void InitializeWithTuneContext(const TuneContext& context) final; void PreTuning(const Array& design_spaces, const Optional& database, const Optional& cost_model) final; - - void PostTuning() final { - ICHECK(f_post_tuning != nullptr) << "PySearchStrategy's PostTuning method not implemented!"; - this->f_post_tuning(); - } - - Optional> GenerateMeasureCandidates() final { - ICHECK(f_generate_measure_candidates != nullptr) - << "PySearchStrategy's GenerateMeasureCandidates method not implemented!"; - return this->f_generate_measure_candidates(); - } - + void PostTuning() final; + Optional> GenerateMeasureCandidates() final; void NotifyRunnerResults(const TuneContext& context, const Array& measure_candidates, - const Array& results) final { - ICHECK(f_notify_runner_results != nullptr) - << "PySearchStrategy's NotifyRunnerResults method not implemented!"; - this->f_notify_runner_results(context, measure_candidates, results); - } + const Array& results); static constexpr const char* _type_key = "meta_schedule.PySearchStrategy"; TVM_DECLARE_FINAL_OBJECT_INFO(PySearchStrategyNode, SearchStrategyNode); diff --git a/include/tvm/meta_schedule/space_generator.h b/include/tvm/meta_schedule/space_generator.h index bad9ae0f6c6e9..f7d6cac31cab6 100644 --- a/include/tvm/meta_schedule/space_generator.h +++ b/include/tvm/meta_schedule/space_generator.h @@ -20,6 +20,10 @@ #define TVM_META_SCHEDULE_SPACE_GENERATOR_H_ #include +#include +#include +#include +#include #include namespace tvm { @@ -64,7 +68,7 @@ class TuneContext; │ └─── Runner Future ◄────┘ │ └─────────────────────────────────────────────────────────────────────┘ */ -class SpaceGeneratorNode : public Object { +class SpaceGeneratorNode : public runtime::Object { public: /*! \brief Default destructor */ virtual ~SpaceGeneratorNode() = default; @@ -112,17 +116,8 @@ class PySpaceGeneratorNode : public SpaceGeneratorNode { // `f_generate_design_space` is not visited } - void InitializeWithTuneContext(const TuneContext& context) final { - ICHECK(f_initialize_with_tune_context != nullptr) - << "PySpaceGenerator's InitializeWithTuneContext method not implemented!"; - f_initialize_with_tune_context(context); - } - - Array GenerateDesignSpace(const IRModule& mod) final { - ICHECK(f_generate_design_space != nullptr) - << "PySpaceGenerator's GenerateDesignSpace method not implemented!"; - return f_generate_design_space(mod); - } + void InitializeWithTuneContext(const TuneContext& context) final; + Array GenerateDesignSpace(const IRModule& mod) final; static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator"; TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode); @@ -132,7 +127,7 @@ class PySpaceGeneratorNode : public SpaceGeneratorNode { * \brief Managed reference to SpaceGeneratorNode. 
* \sa SpaceGeneratorNode */ -class SpaceGenerator : public ObjectRef { +class SpaceGenerator : public runtime::ObjectRef { protected: SpaceGenerator() = default; diff --git a/include/tvm/meta_schedule/task_scheduler.h b/include/tvm/meta_schedule/task_scheduler.h index 5953a2c3e42b1..385816e790e29 100644 --- a/include/tvm/meta_schedule/task_scheduler.h +++ b/include/tvm/meta_schedule/task_scheduler.h @@ -25,6 +25,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace tvm { namespace meta_schedule { @@ -181,42 +187,11 @@ class PyTaskSchedulerNode : public TaskSchedulerNode { // `f_next_task_id` is not visited } - void Tune() final { - if (f_tune == nullptr) { - TaskSchedulerNode::Tune(); - } else { - f_tune(); - } - } - - void InitializeTask(int task_id) final { - if (f_initialize_task == nullptr) { - TaskSchedulerNode::InitializeTask(task_id); - } else { - f_initialize_task(task_id); - } - } - - void TouchTask(int task_id) final { - if (f_touch_task == nullptr) { - return TaskSchedulerNode::TouchTask(task_id); - } else { - return f_touch_task(task_id); - } - } - - Array JoinRunningTask(int task_id) final { - if (f_join_running_task == nullptr) { - return TaskSchedulerNode::JoinRunningTask(task_id); - } else { - return f_join_running_task(task_id); - } - } - - int NextTaskId() final { - ICHECK(f_next_task_id != nullptr) << "PyTaskScheduler's NextTaskId method not implemented!"; - return f_next_task_id(); - } + void Tune() final; + void InitializeTask(int task_id) final; + void TouchTask(int task_id) final; + Array JoinRunningTask(int task_id) final; + int NextTaskId() final; static constexpr const char* _type_key = "meta_schedule.PyTaskScheduler"; TVM_DECLARE_FINAL_OBJECT_INFO(PyTaskSchedulerNode, TaskSchedulerNode); diff --git a/include/tvm/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h index d63fb819f3639..ee09099d1a926 100644 --- a/include/tvm/meta_schedule/tune_context.h +++ b/include/tvm/meta_schedule/tune_context.h @@ -19,6 +19,7 @@ #ifndef TVM_META_SCHEDULE_TUNE_CONTEXT_H_ #define TVM_META_SCHEDULE_TUNE_CONTEXT_H_ +#include #include #include #include @@ -27,6 +28,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/src/meta_schedule/cost_model/cost_model.cc b/src/meta_schedule/cost_model/cost_model.cc index c6efb54303360..aabab5d83a1c9 100644 --- a/src/meta_schedule/cost_model/cost_model.cc +++ b/src/meta_schedule/cost_model/cost_model.cc @@ -21,6 +21,30 @@ namespace tvm { namespace meta_schedule { +void PyCostModelNode::Load(const String& path) { + ICHECK(f_load != nullptr) << "PyCostModel's Load method not implemented!"; + f_load(path); +} + +void PyCostModelNode::Save(const String& path) { + ICHECK(f_save != nullptr) << "PyCostModel's Save method not implemented!"; + f_save(path); +} + +void PyCostModelNode::Update(const TuneContext& context, const Array& candidates, + const Array& results) { + ICHECK(f_update != nullptr) << "PyCostModel's Update method not implemented!"; + f_update(context, candidates, results); +} + +std::vector PyCostModelNode::Predict(const TuneContext& context, + const Array& candidates) { + ICHECK(f_predict != nullptr) << "PyCostModel's Predict method not implemented!"; + std::vector result(candidates.size(), 0.0); + f_predict(context, candidates, result.data()); + return result; +} + CostModel CostModel::PyCostModel(PyCostModelNode::FLoad f_load, // PyCostModelNode::FSave f_save, // PyCostModelNode::FUpdate 
f_update, // diff --git a/src/meta_schedule/feature_extractor/feature_extractor.cc b/src/meta_schedule/feature_extractor/feature_extractor.cc index 84d22493aaa6d..1ebbb6e2e2339 100644 --- a/src/meta_schedule/feature_extractor/feature_extractor.cc +++ b/src/meta_schedule/feature_extractor/feature_extractor.cc @@ -21,6 +21,12 @@ namespace tvm { namespace meta_schedule { +Array PyFeatureExtractorNode::ExtractFrom( + const TuneContext& context, const Array& candidates) { + ICHECK(f_extract_from != nullptr) << "PyFeatureExtractor's ExtractFrom method not implemented!"; + return f_extract_from(context, candidates); +} + FeatureExtractor FeatureExtractor::PyFeatureExtractor( PyFeatureExtractorNode::FExtractFrom f_extract_from, // PyFeatureExtractorNode::FAsString f_as_string) { diff --git a/src/meta_schedule/measure_callback/measure_callback.cc b/src/meta_schedule/measure_callback/measure_callback.cc index 733d118c735d3..c7851a6fadf62 100644 --- a/src/meta_schedule/measure_callback/measure_callback.cc +++ b/src/meta_schedule/measure_callback/measure_callback.cc @@ -21,6 +21,15 @@ namespace tvm { namespace meta_schedule { +void PyMeasureCallbackNode::Apply(const TaskScheduler& task_scheduler, // + int task_id, // + const Array& measure_candidates, // + const Array& builds, // + const Array& results) { + ICHECK(f_apply != nullptr) << "PyMeasureCallback's Apply method not implemented!"; + return f_apply(task_scheduler, task_id, measure_candidates, builds, results); +} + MeasureCallback MeasureCallback::PyMeasureCallback(PyMeasureCallbackNode::FApply f_apply, // PyMeasureCallbackNode::FAsString f_as_string) { ObjectPtr n = make_object(); diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc index 27383adf84e0e..43b95000c71d4 100644 --- a/src/meta_schedule/mutator/mutator.cc +++ b/src/meta_schedule/mutator/mutator.cc @@ -21,6 +21,18 @@ namespace tvm { namespace meta_schedule { +void PyMutatorNode::InitializeWithTuneContext(const TuneContext& context) { + ICHECK(f_initialize_with_tune_context != nullptr) + << "PyMutator's InitializeWithTuneContext method not implemented!"; + f_initialize_with_tune_context(context); +} + +Optional PyMutatorNode::Apply( + const tir::Trace& trace, support::LinearCongruentialEngine::TRandState* rand_state) { + ICHECK(f_apply != nullptr) << "PyMutator's Apply method not implemented!"; + return f_apply(trace, *rand_state); +} + Mutator Mutator::PyMutator( PyMutatorNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyMutatorNode::FApply f_apply, // diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc index ff069e2c68cbd..0f4f1b1192f65 100644 --- a/src/meta_schedule/postproc/postproc.cc +++ b/src/meta_schedule/postproc/postproc.cc @@ -21,6 +21,17 @@ namespace tvm { namespace meta_schedule { +void PyPostprocNode::InitializeWithTuneContext(const TuneContext& context) { + ICHECK(f_initialize_with_tune_context != nullptr) + << "PyPostproc's InitializeWithTuneContext method not implemented!"; + f_initialize_with_tune_context(context); +} + +bool PyPostprocNode::Apply(const tir::Schedule& sch) { + ICHECK(f_apply != nullptr) << "PyPostproc's Apply method not implemented!"; + return f_apply(sch); +} + Postproc Postproc::PyPostproc( PyPostprocNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyPostprocNode::FApply f_apply, // diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index f80f684dafa81..80f8725b0c0d7 100644 
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -21,6 +21,18 @@ namespace tvm { namespace meta_schedule { +void PyScheduleRuleNode::InitializeWithTuneContext(const TuneContext& context) { + ICHECK(f_initialize_with_tune_context != nullptr) + << "PyScheduleRule's InitializeWithTuneContext method not implemented!"; + f_initialize_with_tune_context(context); +} + +Array PyScheduleRuleNode::Apply(const tir::Schedule& sch, + const tir::BlockRV& block) { + ICHECK(f_apply != nullptr) << "PyScheduleRule's Apply method not implemented!"; + return f_apply(sch, block); +} + ScheduleRule ScheduleRule::PyScheduleRule( PyScheduleRuleNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyScheduleRuleNode::FApply f_apply, // diff --git a/src/meta_schedule/search_strategy/search_strategy.cc b/src/meta_schedule/search_strategy/search_strategy.cc index a6a1100cebe60..f4c392ca2f1a1 100644 --- a/src/meta_schedule/search_strategy/search_strategy.cc +++ b/src/meta_schedule/search_strategy/search_strategy.cc @@ -28,11 +28,36 @@ MeasureCandidate::MeasureCandidate(tir::Schedule sch, Array args_info) data_ = std::move(n); } +void PySearchStrategyNode::InitializeWithTuneContext(const TuneContext& context) { + ICHECK(f_initialize_with_tune_context != nullptr) + << "PySearchStrategy's InitializeWithTuneContext method not implemented!"; + f_initialize_with_tune_context(context); +} + void PySearchStrategyNode::PreTuning(const Array& design_spaces, const Optional& database, const Optional& cost_model) { ICHECK(f_pre_tuning != nullptr) << "PySearchStrategy's PreTuning method not implemented!"; - this->f_pre_tuning(design_spaces, database, cost_model); + f_pre_tuning(design_spaces, database, cost_model); +} + +void PySearchStrategyNode::PostTuning() { + ICHECK(f_post_tuning != nullptr) << "PySearchStrategy's PostTuning method not implemented!"; + f_post_tuning(); +} + +Optional> PySearchStrategyNode::GenerateMeasureCandidates() { + ICHECK(f_generate_measure_candidates != nullptr) + << "PySearchStrategy's GenerateMeasureCandidates method not implemented!"; + return f_generate_measure_candidates(); +} + +void PySearchStrategyNode::NotifyRunnerResults(const TuneContext& context, + const Array& measure_candidates, + const Array& results) { + ICHECK(f_notify_runner_results != nullptr) + << "PySearchStrategy's NotifyRunnerResults method not implemented!"; + f_notify_runner_results(context, measure_candidates, results); } SearchStrategy SearchStrategy::PySearchStrategy( diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 6df8da2f7aa12..5c5ab6ebbae5b 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -21,6 +21,18 @@ namespace tvm { namespace meta_schedule { +void PySpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { + ICHECK(f_initialize_with_tune_context != nullptr) + << "PySpaceGenerator's InitializeWithTuneContext method not implemented!"; + f_initialize_with_tune_context(context); +} + +Array PySpaceGeneratorNode::GenerateDesignSpace(const IRModule& mod) { + ICHECK(f_generate_design_space != nullptr) + << "PySpaceGenerator's GenerateDesignSpace method not implemented!"; + return f_generate_design_space(mod); +} + SpaceGenerator SpaceGenerator::PySpaceGenerator( PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context, 
PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space) { diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc index 25867fb4f3bbf..5d41f2edfb26f 100644 --- a/src/meta_schedule/task_scheduler/task_scheduler.cc +++ b/src/meta_schedule/task_scheduler/task_scheduler.cc @@ -199,6 +199,43 @@ Array TaskSchedulerNode::JoinRunningTask(int task_id) { return results; } +void PyTaskSchedulerNode::Tune() { + if (f_tune == nullptr) { + TaskSchedulerNode::Tune(); + } else { + f_tune(); + } +} + +void PyTaskSchedulerNode::InitializeTask(int task_id) { + if (f_initialize_task == nullptr) { + TaskSchedulerNode::InitializeTask(task_id); + } else { + f_initialize_task(task_id); + } +} + +void PyTaskSchedulerNode::TouchTask(int task_id) { + if (f_touch_task == nullptr) { + return TaskSchedulerNode::TouchTask(task_id); + } else { + return f_touch_task(task_id); + } +} + +Array PyTaskSchedulerNode::JoinRunningTask(int task_id) { + if (f_join_running_task == nullptr) { + return TaskSchedulerNode::JoinRunningTask(task_id); + } else { + return f_join_running_task(task_id); + } +} + +int PyTaskSchedulerNode::NextTaskId() { + ICHECK(f_next_task_id != nullptr) << "PyTaskScheduler's NextTaskId method not implemented!"; + return f_next_task_id(); +} + TaskScheduler TaskScheduler::PyTaskScheduler( Array tasks, // Builder builder, // From d8f57ed7ff6daf585ca56bc2cf9326eca9e73fca Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:54:46 -0700 Subject: [PATCH 060/181] [Relay] IndexedGraph improvements in preparation for Collage (#11481) * [Relay] Odd's 'n ends changes to help Collage. - Complete the implementation of WithFields. (Unfortunately they appear to be without unit tests and I continue this tradition...) - InferTypeExpr for InferTypeLocal but return the expression rather than the type. - Remove python binding of InlineComposites since C++ impl was removed some time ago. - Make IndexedGraph more robust as stand-alone datastructure, and avoid unnecessary copies. This will become a fundamental datastructure in Collage rather than just a helper for DFPatternMatcher. - Extend IndexedGraph with a notion of 'basic block' on every dataflow node. Needed by Collage to avoid impossible partitions. * - Revert non IndexedGraph changes. 
* - Stick to 'Indexed graph' terminology - More tests * - Stick to 'Indexed graph' terminology - More tests * - Remove silly unit test --- src/relay/ir/dataflow_matcher.cc | 90 ++-- src/relay/ir/dataflow_matcher_impl.h | 19 +- src/relay/ir/indexed_graph.cc | 526 ++++++++++++++------ src/relay/ir/indexed_graph.h | 283 +++++++++-- src/relay/op/dyn/tensor/transform.cc | 1 + tests/cpp/relay/ir/indexed_graph_test.cc | 205 ++++++++ tests/python/relay/test_dataflow_pattern.py | 35 +- 7 files changed, 922 insertions(+), 237 deletions(-) create mode 100644 tests/cpp/relay/ir/indexed_graph_test.cc diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index 8d7ed163a1975..df896cb690eb2 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -36,6 +36,7 @@ namespace relay { // Pattern Matcher bool DFPatternMatcher::Match(const DFPattern& pattern, const Expr& expr) { + VLOG(1) << "Match " << PrettyPrint(pattern) << " in:" << std::endl << PrettyPrint(expr); memo_.clear(); matched_nodes_.clear(); return VisitDFPattern(pattern, expr); @@ -58,6 +59,7 @@ bool DFPatternMatcher::VisitDFPattern(const DFPattern& pattern, const Expr& expr if (out) { memo_[pattern].push_back(expr); matched_nodes_.push_back(pattern); + VLOG(1) << "Matched " << PrettyPrint(pattern) << " at:" << std::endl << PrettyPrint(expr); } else { ClearMap(watermark); } @@ -124,7 +126,6 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons if (!matches) { return matches; } - VLOG(1) << "considering AttrPatternNode at:\n" << PrettyPrint(expr); auto attributes = attr_pattern->attrs.as()->dict; if (const auto* op_node = expr.as()) { Op op = GetRef(op_node); @@ -299,14 +300,18 @@ bool DFPatternMatcher::VisitDFPattern_(const CallPatternNode* op, const Expr& ex // Recursively find the Dominator parent along all inputs paths. 
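For reference, after this change the matcher no longer builds its own graph: callers construct an IndexedGraph once and lend it to the matcher, as the updated MatchPattern helper below shows. A minimal usage sketch, assuming only the CreateIndexedGraph and DFPatternMatcher declarations introduced by this patch (the graph must outlive the matcher, which keeps a raw pointer to it):

    // Build the dataflow graph for the expression once, then reuse it for matching.
    std::unique_ptr<IndexedGraph<Expr>> expr_graph = CreateIndexedGraph(expr);
    DFPatternMatcher matcher(expr_graph.get());  // borrows expr_graph, does not own it
    bool matched = matcher.Match(pattern, expr);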
bool DFPatternMatcher::MatchesPath(const DominatorPatternNode* op, const Expr& expr) { auto call_node = expr.as(); - for (auto node : expr_graph_.node_map_.at(expr)->inputs_) { - if (!(call_node && node->ref_ == call_node->op)) { + auto index_node = expr_to_node(expr); + for (auto node : index_node->inputs_) { + if (!(call_node && node->ref() == call_node->op)) { memoize_ = true; - if (VisitDFPattern(op->parent, node->ref_)) { + if (VisitDFPattern(op->parent, node->ref())) { return true; } else { memoize_ = false; - if (!VisitDFPattern(op->path, node->ref_) || !MatchesPath(op, node->ref_)) { + if (!VisitDFPattern(op->path, node->ref())) { + return false; + } + if (!MatchesPath(op, node->ref())) { return false; } } @@ -318,19 +323,19 @@ bool DFPatternMatcher::MatchesPath(const DominatorPatternNode* op, const Expr& e // Iteratively ensure that the parent is dominated somewhere by the child or the path bool DFPatternMatcher::DominatesParent(const DominatorPatternNode* op, const Expr& expr) { std::stack stack; - std::unordered_set visited; + std::unordered_set visited; stack.push(expr); while (!stack.empty()) { Expr current = stack.top(); stack.pop(); - for (auto node : expr_graph_.node_map_.at(current)->dominator_children_) { - if (visited.count(node->ref_) == 0) { - if (VisitDFPattern(op->parent, node->ref_)) { + for (auto node : expr_to_node(current)->dominator_children_) { + if (visited.count(node->node_ref_) == 0) { + if (VisitDFPattern(op->parent, node->ref())) { return true; } else { - stack.push(node->ref_); + stack.push(node->ref()); } - visited.insert(node->ref_); + visited.insert(node->node_ref_); } } } @@ -500,7 +505,8 @@ bool DFPatternMatcher::VisitDFPattern_(const WildcardPatternNode* op, const Expr } bool MatchPattern(DFPattern pattern, Expr expr) { - return DFPatternMatcher(expr).Match(pattern, expr); + std::unique_ptr> expr_graph = CreateIndexedGraph(expr); + return DFPatternMatcher(expr_graph.get()).Match(pattern, expr); } TVM_REGISTER_GLOBAL("relay.dataflow_pattern.match").set_body_typed(MatchPattern); @@ -575,7 +581,8 @@ const std::unordered_map& PatternGrouper::GroupMatch pattern_ = pattern; pattern_graph_ = CreateIndexedGraph(pattern_); - auto matcher = DFPatternMatcher(pre); + std::unique_ptr> expr_graph = CreateIndexedGraph(pre); + DFPatternMatcher matcher(expr_graph.get()); matcher_ = &matcher; this->VisitExprs(); return this->groups_; @@ -583,9 +590,9 @@ const std::unordered_map& PatternGrouper::GroupMatch void PatternGrouper::VisitExprs() { std::unordered_set pre_partitioned; - for (size_t i = matcher_->expr_graph_.topological_order_.size(); i != 0; --i) { - size_t index = i - 1; - Expr current = matcher_->expr_graph_.topological_order_.at(index)->ref_; + for (PostDfsIndex i = matcher_->size(); i != 0; --i) { + PostDfsIndex index = i - 1; + const auto current = matcher_->index_to_node(index)->ref(); if (gid_assignments_.count(current) == 0) { // Don't visit nodes we've already grouped if (auto op = current.as()) { if (op->attrs.defined() && op->attrs->dict.count(attr::kPartitionedFromPattern) != 0) { @@ -607,9 +614,10 @@ void PatternGrouper::CreateGroup(const Expr& expr) { auto node_map = matcher_->GetMemo(); // Get fuzzy patterns std::unordered_set fuzzy_matches; - for (auto node : pattern_graph_.topological_order_) { + for (PostDfsIndex index = 0; index < pattern_graph_->size(); ++index) { + auto node = pattern_graph_->index_to_node(index); // Don't treat fuzzy Dominator patterns input variables for partition - if (auto op = node->ref_.as()) { + if (auto op = 
node->ref().as()) { for (auto fuzzy_op : {op->parent, op->path}) { for (auto match : node_map[fuzzy_op]) { fuzzy_matches.insert(match); @@ -617,12 +625,13 @@ void PatternGrouper::CreateGroup(const Expr& expr) { } } // Don't treat Function params or body as input variables for partition - if (node->ref_.as()) { - auto matches = node_map[node->ref_]; + if (node->ref().as()) { + auto matches = node_map[node->ref()]; for (auto match : matches) { - auto graph = CreateIndexedGraph(match.as()->body); - for (auto node : graph.topological_order_) { - fuzzy_matches.insert(node->ref_); + auto sub_graph = CreateIndexedGraph(match.as()->body); + for (PostDfsIndex sub_index = 0; sub_index < sub_graph->size(); ++sub_index) { + auto sub_node = sub_graph->index_to_node(sub_index); + fuzzy_matches.insert(sub_node->ref()); } } } @@ -636,10 +645,11 @@ void PatternGrouper::CreateGroup(const Expr& expr) { std::unordered_map inputs; Array params; - for (auto node : pattern_graph_.topological_order_) { + for (PostDfsIndex index = 0; index < pattern_graph_->size(); ++index) { + auto node = pattern_graph_->index_to_node(index); auto make_input = [&](const Expr& input) { if (fuzzy_matches.count(input) == 0 && input.as() == nullptr && - input.as() == nullptr && !EmbedConst(input, node->ref_)) { + input.as() == nullptr && !EmbedConst(input, node->ref())) { inputs[input] = Var("FunctionVar_" + std::to_string(graph_number_) + "_" + std::to_string(var_number), NullValue()); @@ -648,11 +658,11 @@ void PatternGrouper::CreateGroup(const Expr& expr) { var_number++; } }; - auto tuple = node->ref_.as(); - auto call = node->ref_.as(); + auto tuple = node->ref().as(); + auto call = node->ref().as(); if (tuple && !tuple->fields.defined()) { - if (node_map.count(node->ref_)) { - auto matches = node_map[node->ref_]; + if (node_map.count(node->ref())) { + auto matches = node_map[node->ref()]; for (auto match : matches) { for (auto input : match.as()->fields) { make_input(input); @@ -660,8 +670,8 @@ void PatternGrouper::CreateGroup(const Expr& expr) { } } } else if (call && !call->args.defined()) { - if (node_map.count(node->ref_)) { - auto matches = node_map[node->ref_]; + if (node_map.count(node->ref())) { + auto matches = node_map[node->ref()]; for (auto match : matches) { for (auto input : match.as()->args) { make_input(input); @@ -669,8 +679,8 @@ void PatternGrouper::CreateGroup(const Expr& expr) { } } } else if (node->inputs_.size() == 0) { - if (node_map.count(node->ref_)) { - auto matches = node_map[node->ref_]; + if (node_map.count(node->ref())) { + auto matches = node_map[node->ref()]; for (auto match : matches) { make_input(match); } @@ -708,13 +718,17 @@ void PatternGrouper::CreateGroup(const Expr& expr) { return; } else if (kv.second != body) { // if the node isn't the output of the group - auto node = matcher_->expr_graph_.node_map_.at(kv.first); + auto node = matcher_->expr_to_node(kv.first); for (auto* output : node->outputs_) { // and the node is used by nodes outside of the group - if (memo.count(output->ref_) == 0 && - !matcher_->expr_graph_.node_map_.at(expr)->Dominates(output)) { - // Exit because nodes in this pattern's body are used outside the pattern - // fusing it would be invalid + if (memo.count(output->ref()) == 0) { + // TODO(mbs): This condition used to also include the following test, which since + // the dominators relation is used back-to-front was always vacuously true. So the + // code is just rejecting the match if a strictly internal node happened to connect + // to an outside node. 
+ ICHECK(!matcher_->expr_to_node(expr)->Dominates(output)); + // Exit because nodes in this pattern's body are used outside the pattern, fusing it + // would be invalid return; } } diff --git a/src/relay/ir/dataflow_matcher_impl.h b/src/relay/ir/dataflow_matcher_impl.h index d993d4720e4ed..f04190f72e40b 100644 --- a/src/relay/ir/dataflow_matcher_impl.h +++ b/src/relay/ir/dataflow_matcher_impl.h @@ -27,7 +27,9 @@ #include #include #include +#include +#include #include #include #include @@ -39,10 +41,20 @@ namespace relay { class DFPatternMatcher : public DFPatternFunctor { public: - explicit DFPatternMatcher(const Expr& root_expr) : expr_graph_(CreateIndexedGraph(root_expr)) {} + explicit DFPatternMatcher(const IndexedGraph* expr_graph) : expr_graph_(expr_graph) {} bool Match(const DFPattern& pattern, const Expr& expr); Map> GetMemo() { return Map>(memo_); } - const IndexedGraph expr_graph_; + + const IndexedGraph::Node* expr_to_node(const Expr& expr) const { + return expr_graph_->item_to_node(expr); + } + const IndexedGraph::Node* index_to_node(size_t index) const { + return expr_graph_->index_to_node(index); + } + size_t size() const { return expr_graph_->size(); } + const std::unordered_map, ObjectPtrHash, ObjectPtrEqual>& memo() const { + return memo_; + } protected: bool VisitDFPattern(const DFPattern& pattern, const Expr& expr) override; @@ -67,6 +79,7 @@ class DFPatternMatcher : public DFPatternFunctor* expr_graph_; std::unordered_map, ObjectPtrHash, ObjectPtrEqual> memo_; std::vector matched_nodes_; bool memoize_ = true; @@ -131,7 +144,7 @@ class PatternGrouper { std::unordered_map groups_; std::unordered_map gid_assignments_; DFPatternMatcher* matcher_ = nullptr; - IndexedGraph pattern_graph_; + std::unique_ptr> pattern_graph_; int gid_ = 0; int graph_number_ = 0; }; diff --git a/src/relay/ir/indexed_graph.cc b/src/relay/ir/indexed_graph.cc index 4efe57b491db0..f39ff4850eae1 100644 --- a/src/relay/ir/indexed_graph.cc +++ b/src/relay/ir/indexed_graph.cc @@ -19,195 +19,393 @@ /*! * \file src/relay/ir/indexed_graph.cc - * \brief Utilties for Creating Indexed Graphs. + * \brief A graph representation of the dataflow in a Relay expression or Relay (dataflow) + * pattern. */ #include "indexed_graph.h" #include #include #include -#include +#include + +#include namespace tvm { namespace relay { -// IndexedGraph +std::string RefToSummary(const Expr& expr) { + class Visitor : public ExprFunctor { + std::string VisitExpr_(const VarNode* op) final { return "%" + op->name_hint(); } + std::string VisitExpr_(const GlobalVarNode* op) final { return "@" + op->name_hint; } + std::string VisitExpr_(const ConstantNode* op) final { return "const"; } + std::string VisitExpr_(const TupleNode* op) final { + return "tuple(" + std::to_string(op->fields.size()) + ")"; + } + std::string VisitExpr_(const FunctionNode* op) final { return "fn"; } + std::string VisitExpr_(const CallNode* op) final { + return VisitExpr(op->op) + "(" + std::to_string(op->args.size()) + ")"; + } + std::string VisitExpr_(const LetNode* op) final { return "let"; } + std::string VisitExpr_(const IfNode* op) final { return "if"; } + std::string VisitExpr_(const OpNode* op) final { return op->name; } + std::string VisitExpr_(const TupleGetItemNode* op) final { + return "." 
+ std::to_string(op->index); + } + std::string VisitExpr_(const RefCreateNode* op) final { return "ref_create"; } + std::string VisitExpr_(const RefReadNode* op) final { return "ref_read"; } + std::string VisitExpr_(const RefWriteNode* op) final { return "ref_write"; } + std::string VisitExpr_(const ConstructorNode* op) final { return "ctor"; } + std::string VisitExpr_(const MatchNode* op) final { return "match"; } + }; + return Visitor().VisitExpr(expr); +} + +std::string RefToSummary(const DFPattern& pattern) { + // TODO(mbs): Implement as debugging requires. + return ""; +} -IndexedGraph CreateIndexedGraph(const Expr& expr) { - using NodePtr = std::shared_ptr::Node>; - /*! \brief Creator Creates an IndexedGraph and determintes Topological order */ +std::unique_ptr> CreateIndexedGraph(const Expr& expr) { + /*! + * \brief Adds indexed graph nodes in post-dfs order, and discovers which let-bound vars are to + * recursive functions. + */ class Creator : public MixedModeVisitor { public: - IndexedGraph CreateGraph(const Expr& expr) { + std::pair>, + std::unique_ptr>> + CreateGraph(const Expr& expr) { VisitExpr(expr); - graph_.node_map_[expr]->is_external_ = true; - return std::move(graph_); + // Last visited node is implicitly used 'externally'. + graph_->item_to_node(expr)->is_external_ = true; + return {std::move(graph_), std::move(rec_calls_)}; } protected: using MixedModeVisitor::VisitExpr_; + // By the default the MixedModeVisitor will place + // - callee and arguments before a call + // - tuple fields before a tuple + // - tuple before a tuple projection void VisitLeaf(const Expr& expr) override { + if (const auto* var_node = expr.as()) { + if (var_node == current_let_bound_var_) { + // Don't visit occurrences of let-rec bound vars in the recursive function body. + // Instead, wait for them to be visited at call sites outside of the function. + VLOG(1) << "Ignore let-rec var '" << var_node->name_hint() << "'"; + return; + } + } + MixedModeVisitor::VisitLeaf(expr); - auto node = std::make_shared::Node>(expr, index_++); - graph_.node_map_[expr] = node; - graph_.topological_order_.push_back(node); + graph_->AddNode(expr); + + if (const auto* call_node = expr.as()) { + if (const auto* var_node = call_node->op.as()) { + if (var_node == current_let_bound_var_) { + // Remember this is a recursive call to the let-rec bound function. + // The Annotator functor below will not record any dependency from the let-rec bound + // var to the expression so that the indexed graph is always a DAG. + VLOG(1) << "Remembering recursive call to '" << var_node->name_hint() << "'"; + rec_calls_->emplace(call_node); + } + } + } } - void VisitExpr_(const LetNode* let) override { + void VisitExpr_(const LetNode* let_node) override { auto pre_visit = [&](const LetNode* op) { - this->VisitSpan(op->span); - this->VisitExpr(op->value); - this->VisitExpr(op->var); + // Let-bound values come before their let-bound variable. + const VarNode* prev_let_bound_var = current_let_bound_var_; + current_let_bound_var_ = op->var.get(); + VisitExpr(op->value); + current_let_bound_var_ = prev_let_bound_var; + VisitExpr(op->var); }; auto post_visit = [&](const LetNode* op) { - this->VisitExpr(op->body); - if (let != op) { - Expr expr = GetRef(op); + VisitExpr(op->body); + if (let_node != op) { + // Replicate VisitLeaf, which we are effectively bypassing. 
visit_counter_[op]++; - auto node = std::make_shared::Node>(expr, index_++); - graph_.node_map_[expr] = node; - graph_.topological_order_.push_back(node); + graph_->AddNode(GetRef(op)); } }; - ExpandANormalForm(let, pre_visit, post_visit); + ExpandANormalForm(let_node, pre_visit, post_visit); } - IndexedGraph graph_; - size_t index_ = 0; + class PatternCreator : public PatternVisitor { + public: + explicit PatternCreator(Creator* creator) : creator_(creator) {} + + private: + void VisitPattern_(const PatternVarNode* pattern_var_node) final { + creator_->VisitLeaf(pattern_var_node->var); + } + + Creator* creator_; + }; + + void VisitExpr_(const MatchNode* match_node) override { + // Matched data comes before match-bound vars then match rhs, in match order. + VisitExpr(match_node->data); + for (const Clause& c : match_node->clauses) { + PatternCreator pattern_creator(this); + pattern_creator.VisitPattern(c->lhs); + VisitExpr(c->rhs); + } + } + + /*! \brief Graph we are accumulated nodes into. */ + std::unique_ptr> graph_ = std::make_unique>(); + /*! \brief Variable the currently visited expression is to be let-bound to, if any. */ + const VarNode* current_let_bound_var_ = nullptr; + /*! \brief Accumulated calls to recursive functions. */ + std::unique_ptr> rec_calls_ = + std::make_unique>(); }; - /*! \brief Annotator takes an IndexedGraph, fills it's forward outputs, and does dominator tree - * analysis. + + /*! + * \brief Fills in the inputs and outputs for all nodes, then does dominator analysis. * - * Annotator use ExprFunctor to visit nodes, but iterates over them in pre-determined - * topological order instead of recursing. + * Thought we use the ExprFunctor to visit nodes, we never recurse and instead just inspect + * each sub-expression's immediate sub-sub-expressions to accumulate inputs and outputs. */ - class Annotator : public ExprFunctor { + class Annotator : public ExprFunctor { public: - Annotator(const IndexedGraph& graph) : graph_(graph) {} - IndexedGraph Annotate() { + explicit Annotator(std::pair>, + std::unique_ptr>> + args) + : graph_(std::move(args.first)), rec_calls_(std::move(args.second)) {} + + std::unique_ptr> Annotate() { // Visit all of the nodes in topological order to get forward outputs - for (const auto& node : graph_.topological_order_) { - ExprFunctor::VisitExpr(node->ref_, nullptr); + for (PostDfsIndex index = 0; index < graph_->size(); ++index) { + VisitExpr(graph_->index_to_node(index)->ref()); } // do the dominator analysis - graph_.PostDom(); + graph_->PostDom(); return std::move(graph_); } - /*! Default visitation pushes the parent to the child's outputs and the child to the parent's - * inputs*/ - void VisitExpr(const Expr& expr, NodePtr parent) override { - auto current = graph_.node_map_[expr]; - if (parent) { - current->outputs_.push_back(parent.get()); - parent->inputs_.push_back(current.get()); - } + /*! + * \brief Add \p parent as a possible output of the node corresponding to \p expr. 
+ */ + void AddOutput(const Expr& expr, IndexedGraph::Node* parent) { + auto current = graph_->item_to_node(expr); + current->outputs_.push_back(parent); + parent->inputs_.push_back(current); } protected: - IndexedGraph graph_; - void VisitExpr_(const VarNode* op, NodePtr parent) override { - if (op->type_annotation.defined()) { - this->VisitType(op->type_annotation); - } - } + void VisitExpr_(const VarNode* var_node) override {} - void VisitExpr_(const GlobalVarNode* op, NodePtr parent) override {} + void VisitExpr_(const GlobalVarNode* global_var_node) override {} - void VisitExpr_(const ConstantNode* op, NodePtr parent) override {} + void VisitExpr_(const ConstantNode* constant_node) override {} - void VisitExpr_(const TupleNode* op, NodePtr parent) override { - for (auto field : op->fields) { - this->VisitExpr(field, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const TupleNode* tuple_node) override { + auto node = graph_->item_to_node(GetRef(tuple_node)); + for (auto field : tuple_node->fields) { + AddOutput(field, node); } } - void VisitExpr_(const FunctionNode* op, NodePtr parent) override { - for (auto param : op->params) { - this->VisitExpr(param, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const FunctionNode* function_node) override { + auto node = graph_->item_to_node(GetRef(function_node)); + // Nothing to do for parameters -- each use of a parameter will contribute to its outputs. + AddOutput(function_node->body, node); + } + + void VisitExpr_(const CallNode* call_node) override { + auto node = graph_->item_to_node(GetRef(call_node)); + if (rec_calls_->count(call_node)) { + // We want the indexed graph to be a DAG, so don't consider a call to a let-rec bound + // function from inside the function to depend on the let-rec bound var. + VLOG(1) << "Ignoring op in call " << RefToSummary(GetRef(call_node)); + } else { + AddOutput(call_node->op, node); + } + for (auto arg : call_node->args) { + AddOutput(arg, node); } + } + + void VisitExpr_(const LetNode* let_node) override { + auto node = graph_->item_to_node(GetRef(let_node)); + auto let_var_node = graph_->item_to_node(let_node->var); + AddOutput(let_node->value, let_var_node); + // Nothing to do for the let-bound variable -- each use of that variable in the let-body + // will contribute to its outputs. 
+ AddOutput(let_node->body, node); + } - this->VisitExpr(op->body, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const IfNode* if_node) override { + auto node = graph_->item_to_node(GetRef(if_node)); + AddOutput(if_node->cond, node); + AddOutput(if_node->true_branch, node); + AddOutput(if_node->false_branch, node); } - void VisitExpr_(const CallNode* op, NodePtr parent) override { - this->VisitExpr(op->op, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const OpNode* op_node) override {} - for (auto ty_arg : op->type_args) { - this->VisitType(ty_arg); + void VisitExpr_(const TupleGetItemNode* tuple_get_item_node) override { + auto node = graph_->item_to_node(GetRef(tuple_get_item_node)); + AddOutput(tuple_get_item_node->tuple, node); + } + + void VisitExpr_(const RefCreateNode* ref_create_node) override { + auto node = graph_->item_to_node(GetRef(ref_create_node)); + AddOutput(ref_create_node->value, node); + } + + void VisitExpr_(const RefReadNode* ref_read_node) override { + auto node = graph_->item_to_node(GetRef(ref_read_node)); + AddOutput(ref_read_node->ref, node); + } + + void VisitExpr_(const RefWriteNode* ref_write_node) override { + auto node = graph_->item_to_node(GetRef(ref_write_node)); + AddOutput(ref_write_node->ref, node); + AddOutput(ref_write_node->value, node); + } + + void VisitExpr_(const ConstructorNode* constructor_node) override {} + + class PatternAnnotator : public PatternVisitor { + public: + PatternAnnotator(Annotator* annotator, const ExprNode* adt_node) + : annotator_(annotator), adt_node_(adt_node) {} + + private: + void VisitPattern_(const PatternVarNode* pattern_var_node) final { + auto node = annotator_->graph_->item_to_node(pattern_var_node->var); + annotator_->AddOutput(GetRef(adt_node_), node); } - for (auto arg : op->args) { - this->VisitExpr(arg, graph_.node_map_[GetRef(op)]); + Annotator* annotator_; + const ExprNode* adt_node_; + }; + + void VisitExpr_(const MatchNode* match_node) override { + // Data flows from the match data to pattern vars into match arms and out into overall + // match. + auto node = graph_->item_to_node(GetRef(match_node)); + for (const Clause& c : match_node->clauses) { + PatternAnnotator pattern_annotator(this, match_node->data.get()); + pattern_annotator.VisitPattern(c->lhs); + AddOutput(c->rhs, node); } } - void VisitExpr_(const LetNode* op, NodePtr parent) override { - this->VisitExpr(op->value, graph_.node_map_[GetRef(op)]); - this->VisitExpr(op->var, graph_.node_map_[GetRef(op)]); - this->VisitExpr(op->body, graph_.node_map_[GetRef(op)]); - } + std::unique_ptr> graph_; + /*! \brief Accumulated calls to recursive functions. */ + std::unique_ptr> rec_calls_; + }; + + /*! \brief Fills in the basic blocks for all nodes. 
*/ + class Blocker : public MixedModeVisitor { + public: + explicit Blocker(std::unique_ptr> graph) : graph_(std::move(graph)) {} - void VisitExpr_(const IfNode* op, NodePtr parent) override { - this->VisitExpr(op->cond, graph_.node_map_[GetRef(op)]); - this->VisitExpr(op->true_branch, graph_.node_map_[GetRef(op)]); - this->VisitExpr(op->false_branch, graph_.node_map_[GetRef(op)]); + std::unique_ptr> Scope(const Expr& expr) { + VisitExpr(expr); + return std::move(graph_); } - void VisitExpr_(const OpNode* op, NodePtr parent) override { return; } + private: + using MixedModeVisitor::VisitExpr_; - void VisitExpr_(const TupleGetItemNode* op, NodePtr parent) override { - this->VisitExpr(op->tuple, graph_.node_map_[GetRef(op)]); + void VisitLeaf(const Expr& expr) override { + MixedModeVisitor::VisitLeaf(expr); + SetScope(expr); } - void VisitExpr_(const RefCreateNode* op, NodePtr parent) override { - this->VisitExpr(op->value, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const FunctionNode* function_node) override { + auto node = graph_->item_to_node(GetRef(function_node)); + basic_block_stack_.push_back(node); + ExprVisitor::VisitExpr_(function_node); + basic_block_stack_.pop_back(); } - void VisitExpr_(const RefReadNode* op, NodePtr parent) override { - this->VisitExpr(op->ref, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const IfNode* if_node) override { + VisitExpr(if_node->cond); + auto node = graph_->item_to_node(GetRef(if_node)); + basic_block_stack_.push_back(node); + VisitExpr(if_node->true_branch); + VisitExpr(if_node->false_branch); + basic_block_stack_.pop_back(); } - void VisitExpr_(const RefWriteNode* op, NodePtr parent) override { - this->VisitExpr(op->ref, graph_.node_map_[GetRef(op)]); - this->VisitExpr(op->value, graph_.node_map_[GetRef(op)]); + void VisitExpr_(const LetNode* let_node) override { + auto pre_visit = [&](const LetNode* op) { + VisitExpr(op->value); + VisitExpr(op->var); + }; + auto post_visit = [&](const LetNode* op) { + VisitExpr(op->body); + if (let_node != op) { + visit_counter_[op]++; + SetScope(GetRef(op)); + } + }; + ExpandANormalForm(let_node, pre_visit, post_visit); } - void VisitExpr_(const ConstructorNode* op, NodePtr parent) override { - for (const Type& t : op->inputs) { - this->VisitType(t); + class PatternBlocker : public PatternVisitor { + public: + explicit PatternBlocker(Blocker* scoper) : scoper_(scoper) {} + + private: + void VisitPattern_(const PatternVarNode* pattern_var_node) final { + scoper_->SetScope(pattern_var_node->var); } - this->VisitType(op->belong_to); - } - void VisitExpr_(const MatchNode* op, NodePtr parent) override { - this->VisitExpr(op->data, graph_.node_map_[GetRef(op)]); - for (const Clause& c : op->clauses) { - this->VisitClause(c, graph_.node_map_[GetRef(op)]); + Blocker* scoper_; + }; + + void VisitExpr_(const MatchNode* match_node) override { + VisitExpr(match_node->data); + auto node = graph_->item_to_node(GetRef(match_node)); + basic_block_stack_.push_back(node); + for (const Clause& c : match_node->clauses) { + PatternBlocker pattern_scoper(this); + pattern_scoper.VisitPattern(c->lhs); + VisitExpr(c->rhs); } + basic_block_stack_.pop_back(); } - void VisitClause(const Clause& op, NodePtr parent) { - this->VisitPattern(op->lhs); - this->VisitExpr(op->rhs, parent); + void SetScope(const Expr& expr) { + auto node = graph_->item_to_node(expr); + if (!basic_block_stack_.empty()) { + node->basic_block_ = basic_block_stack_.back(); + } } - void VisitPattern(const Pattern& p) { return; } - - void VisitType(const 
Type& t) { return; } + std::unique_ptr> graph_; + std::vector::Node*> basic_block_stack_; }; - return Annotator(Creator().CreateGraph(expr)).Annotate(); + + VLOG(1) << "CreateIndexedGraph:" << std::endl << PrettyPrint(expr); + std::unique_ptr> graph = + Blocker(Annotator(Creator().CreateGraph(expr)).Annotate()).Scope(expr); + VLOG(1) << "graph:" << std::endl << graph->ToString(); +#if TVM_LOG_DEBUG + graph->CheckValid(); +#endif + return graph; } -IndexedGraph CreateIndexedGraph(const DFPattern& pattern) { - using NodePtr = std::shared_ptr::Node>; - /*! \brief Creator Creates an IndexedGraph and determintes Toplogical order */ +std::unique_ptr> CreateIndexedGraph(const DFPattern& pattern) { + /*! \brief Creates an IndexedGraph and determines topological order */ class Creator : public DFPatternVisitor { public: - IndexedGraph CreateGraph(const DFPattern& pattern) { + std::unique_ptr> CreateGraph(const DFPattern& pattern) { + graph_ = std::make_unique>(); VisitDFPattern(pattern); - graph_.node_map_[pattern]->is_external_ = true; + graph_->item_to_node(pattern)->is_external_ = true; return std::move(graph_); } @@ -215,121 +413,135 @@ IndexedGraph CreateIndexedGraph(const DFPattern& pattern) { void VisitDFPattern(const DFPattern& pattern) override { if (this->visited_.count(pattern.get()) == 0) { DFPatternVisitor::VisitDFPattern(pattern); - auto node = std::make_shared::Node>(pattern, index_++); - graph_.node_map_[pattern] = node; - graph_.topological_order_.push_back(node); + graph_->AddNode(pattern); } } - IndexedGraph graph_; - size_t index_ = 0; + + std::unique_ptr> graph_; }; + /*! \brief Annotator takes an IndexedGraph, fills it's forward outputs, and does domiantor tree * analysis. * * Annotator use ExprFunctor to visit nodes, but iterates over them in pre-determined * topological order instead of recursing. */ - class Annotator : public DFPatternFunctor { + class Annotator : public DFPatternFunctor { public: - Annotator(const IndexedGraph& graph) : graph_(graph) {} - IndexedGraph Annotate() { + Annotator(std::unique_ptr> graph) : graph_(std::move(graph)) {} + + std::unique_ptr> Annotate() { // Visit all of the nodes in topological order to get forward outputs - for (const auto& node : graph_.topological_order_) { - DFPatternFunctor::VisitDFPattern(node->ref_, nullptr); + for (PostDfsIndex index = 0; index < graph_->size(); ++index) { + VisitDFPattern(graph_->index_to_node(index)->ref()); } - graph_.PostDom(); // do the dominator analysis + graph_->PostDom(); return std::move(graph_); } /*! 
Default visitation pushes the parent to the child's outputs */ - void VisitDFPattern(const DFPattern& pattern, NodePtr parent) override { - auto current = graph_.node_map_[pattern]; + void AddOutput(const DFPattern& pattern, IndexedGraph::Node* parent) { + auto current = graph_->item_to_node(pattern); if (parent) { - current->outputs_.push_back(parent.get()); - parent->inputs_.push_back(current.get()); + current->outputs_.push_back(parent); + parent->inputs_.push_back(current); } } protected: - IndexedGraph graph_; - void VisitDFPattern_(const AltPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->left, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->right, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const AltPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->left, node); + AddOutput(op->right, node); } - void VisitDFPattern_(const AttrPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->pattern, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const AttrPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->pattern, node); } - void VisitDFPattern_(const CallPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->op, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const CallPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->op, node); if (op->args.defined()) { for (auto arg : op->args) { - VisitDFPattern(arg, graph_.node_map_[GetRef(op)]); + AddOutput(arg, node); } } } - void VisitDFPattern_(const ConstantPatternNode* op, NodePtr parent) override {} + void VisitDFPattern_(const ConstantPatternNode* op) override {} - void VisitDFPattern_(const DataTypePatternNode* op, NodePtr parent) override { - VisitDFPattern(op->pattern, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const DataTypePatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->pattern, node); } - void VisitDFPattern_(const DominatorPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->parent, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->path, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->child, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const DominatorPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->parent, node); + AddOutput(op->path, node); + AddOutput(op->child, node); } - void VisitDFPattern_(const ExprPatternNode* op, NodePtr parent) override {} + void VisitDFPattern_(const ExprPatternNode* op) override {} - void VisitDFPattern_(const FunctionPatternNode* op, NodePtr parent) override { + void VisitDFPattern_(const FunctionPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); if (op->params.defined()) { for (auto param : op->params) { - VisitDFPattern(param, graph_.node_map_[GetRef(op)]); + AddOutput(param, node); } } - VisitDFPattern(op->body, graph_.node_map_[GetRef(op)]); + AddOutput(op->body, node); } - void VisitDFPattern_(const ShapePatternNode* op, NodePtr parent) override { - VisitDFPattern(op->pattern, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const ShapePatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->pattern, node); } - void VisitDFPattern_(const TupleGetItemPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->tuple, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const TupleGetItemPatternNode* op) override { + auto node = 
graph_->item_to_node(GetRef(op)); + AddOutput(op->tuple, node); } - void VisitDFPattern_(const TuplePatternNode* op, NodePtr parent) override { + void VisitDFPattern_(const TuplePatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); if (op->fields.defined()) { for (auto field : op->fields) { - VisitDFPattern(field, graph_.node_map_[GetRef(op)]); + AddOutput(field, node); } } } - void VisitDFPattern_(const IfPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->cond, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->true_branch, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->false_branch, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const IfPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->cond, node); + AddOutput(op->true_branch, node); + AddOutput(op->false_branch, node); } - void VisitDFPattern_(const LetPatternNode* op, NodePtr parent) override { - VisitDFPattern(op->var, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->value, graph_.node_map_[GetRef(op)]); - VisitDFPattern(op->body, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const LetPatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->var, node); + AddOutput(op->value, node); + AddOutput(op->body, node); } - void VisitDFPattern_(const TypePatternNode* op, NodePtr parent) override { - VisitDFPattern(op->pattern, graph_.node_map_[GetRef(op)]); + void VisitDFPattern_(const TypePatternNode* op) override { + auto node = graph_->item_to_node(GetRef(op)); + AddOutput(op->pattern, node); } - void VisitDFPattern_(const VarPatternNode* op, NodePtr parent) override {} + void VisitDFPattern_(const VarPatternNode* op) override {} - void VisitDFPattern_(const WildcardPatternNode* op, NodePtr parent) override {} + void VisitDFPattern_(const WildcardPatternNode* op) override {} + + std::unique_ptr> graph_; }; + return Annotator(Creator().CreateGraph(pattern)).Annotate(); } diff --git a/src/relay/ir/indexed_graph.h b/src/relay/ir/indexed_graph.h index d073bcaeea5c9..c1ce53f40da3d 100644 --- a/src/relay/ir/indexed_graph.h +++ b/src/relay/ir/indexed_graph.h @@ -19,7 +19,12 @@ /*! * \file src/relay/ir/indexed_graph.h - * \brief A pattern matcher for matching dataflow properties. + * \brief A graph representation of the dataflow in a Relay expression or Relay (dataflow) + * pattern. Each 'indexed graph' node is 1:1 with an expression/pattern 'node', hence the + * term 'IndexedGraph'. Dataflow is captured in a generic representation which is convenient + * for analysis, particularly pattern matching and partitioning. + * + * TODO(mbs): Copied from fuse_ops.cc, consider refactoring to share implementation. */ #ifndef TVM_RELAY_IR_INDEXED_GRAPH_H_ #define TVM_RELAY_IR_INDEXED_GRAPH_H_ @@ -28,6 +33,7 @@ #include #include +#include #include #include #include @@ -36,47 +42,108 @@ namespace tvm { namespace relay { +/*! \brief The index of a node in the post-dfs traversal of overall expression. */ +using PostDfsIndex = size_t; + +/*! + * \brief Returns a brief summary of the 'reference' expression or pattern. Only used by + * IndexedGraph::ToString() for debugging. + */ +std::string RefToSummary(const Expr& expr); +std::string RefToSummary(const DFPattern& pattern); + /*! 
- * \brief A Wrapper around a templated graph type - * Holds a forward-backward indexed representation of the graph and a dominator tree representation - * of the graph + * \brief Represents the implied dataflow of an expression or (dataflow) pattern as a DAG who's + * nodes are 1:1 with those in the underlying expression/pattern. + * + * Each indexed graph node captures: + * - Dataflow inputs. + * - Dataflow outputs (or a flag indicating the node is an implied output). + * - Dominator parent (ie closest node at which all outputs of the current node re-combine). + * - Dominator children (inverse of above). + * - Basic block (ie node representing the body of a function, arm of an if, etc). * - * This class is templated and the implementaiton is in the header file so we can analyze both - * DFPattern and Expr with the same infrastructure. + * This class is templated so we can analyze both DFPatterns and Exprs with the same infrastructure. * - * IndexedGraph should be instantiated through the CreateIndexedGraph utilities. + * IndexedGraph should be instantiated through the CreateIndexedGraph utilities below. */ template class IndexedGraph { public: - /*! \brief A Node that wraps the input type and represents the indexed graph and dominator tree */ + using TNode = typename T::ContainerType; + + /*! \brief A Node in the graph. */ struct Node { /*! \brief Node Constructor - * \param ref The input graph node - * \param index The index of the node in toplogical order + * \param ref The expression or dataflow pattern node this indexed graph node is augmenting. + * \param index The index of this node in the topological order */ - Node(const T& ref, const size_t index) : ref_(ref), index_(index) {} + Node(const TNode* ref, PostDfsIndex index) : node_ref_(ref), index_(index) {} + + /*! \brief The underlying expression or pattern node. */ + const TNode* node_ref_; - /*! \brief The input node */ - const T ref_; - /*! \brief The topological order index */ - const size_t index_; + T ref() const { + ICHECK(node_ref_ != nullptr); + return GetRef(node_ref_); + } + + /*! + * \brief The index of this node in post-dfs order. If left.index_ > right.index_ then + * left does not flow into right. If left.index_ = right.index_ then left and right are + * the same node. + */ + const PostDfsIndex index_; - /*! \brief A boolean to determine if this node is external to the graph */ + /*! \brief If true this node has implicit outputs, for example as the result of a function. */ bool is_external_ = false; - /*! \brief The forward inputs of the node */ + /*! \brief Immediate dataflow inputs to this node. */ std::vector inputs_; - /*! \brief The forward outputs/users of the node */ + /*! \brief Immediate dataflow outputs of this node -- may be empty if is_external_ is true. */ std::vector outputs_; - /*! \brief The depth of the node in the dominator tree */ + /*! + * \brief The node representing the 'basic block' containing this node: + * - Function bodies start a new basic block for their bodies. + * - The true and false branches of an if start their own blocks. + * - The arms of a match each have their own blocks. + */ + Node* basic_block_ = nullptr; + + /*! \brief The depth of this node in the dominator tree */ size_t depth_ = 0; - /*! \brief The dominator parent/final user of the outputs of this node */ - Node* dominator_parent_; - /*! \brief The nodes this node dominates */ + /*! + * \brief The dominator parent of this node. 
This is the node N with least index such that + * all possible dataflows from this node pass through N. + */ + Node* dominator_parent_ = nullptr; + /*! \brief The nodes this node dominates. */ std::vector dominator_children_; - bool Dominates(const Node* other) { + /*! + * Add to \p nodes all the nodes which are strictly downstream of \p this, ie can be + * reached by following output paths. + */ + void AccumulateDownstreamNodes(std::unordered_set* nodes) const { + std::stack stack; + stack.push(this); + while (!stack.empty()) { + const Node* current = stack.top(); + stack.pop(); + for (auto node : current->outputs_) { + if (nodes->count(node) == 0) { + stack.push(node); + nodes->insert(node); + } + } + } + } + + /*! + * \brief Returns true if \p this is a dominator of \p other. Ie all dataflow paths from \p + * other pass through \p this. + */ + bool Dominates(const Node* other) const { std::stack stack; std::unordered_set visited; stack.push(this); @@ -97,10 +164,125 @@ class IndexedGraph { return false; } }; + + PostDfsIndex size() const { return topological_order_.size(); } + + Node* item_to_node(const T& item) { return item_to_node(item.get()); } + const Node* item_to_node(const T& item) const { return item_to_node(item.get()); } + + Node* item_to_node(const TNode* item) { + auto itr = node_map_.find(item); + ICHECK(itr != node_map_.end()) << PrettyPrint(GetRef(item)); + return itr->second; + } + + const Node* item_to_node(const TNode* item) const { + auto itr = node_map_.find(item); + ICHECK(itr != node_map_.end()) << PrettyPrint(GetRef(item)); + return itr->second; + } + + Node* index_to_node(PostDfsIndex index) { + ICHECK_LT(index, topological_order_.size()) << index; + return topological_order_[index].get(); + } + + const Node* index_to_node(PostDfsIndex index) const { + ICHECK_LT(index, topological_order_.size()) << index; + return topological_order_[index].get(); + } + + /*! + * \brief (For debugging only) Returns description of indexed graph with hints as to the + * sub-expressions or sub-patterns corresponding to each indexed graph node. + */ + std::string ToString() const { + std::ostringstream os; + os << "IndexedGraph(size = " << topological_order_.size() << ") {" << std::endl; + for (PostDfsIndex index = 0; index < topological_order_.size(); ++index) { + const Node* node = topological_order_[index].get(); + ICHECK_EQ(index, node->index_); + os << " " << index << " (" << RefToSummary(node->ref()) << "): inputs=["; + for (const auto* sub_node : node->inputs_) { + os << sub_node->index_ << ","; + } + os << "], outputs=["; + for (const auto* sub_node : node->outputs_) { + os << sub_node->index_ << ","; + } + os << "]"; + if (node->is_external_) { + os << ", external"; + } + if (node->basic_block_) { + os << ", basic_block=" << node->basic_block_->index_; + } + if (node->depth_ > 0) { + os << ", depth=" << node->depth_; + } + if (node->dominator_parent_) { + os << ", dom_parent=" << node->dominator_parent_->index_; + } + os << ", dom_children=["; + for (const auto* sub_node : node->dominator_children_) { + os << sub_node->index_ << ","; + } + os << "]" << std::endl; + } + os << "}"; + return os.str(); + } + + /*! + * Check-fails if the graph is ill-formed. For debugging only. + */ + void CheckValid() const { + ICHECK_GT(topological_order_.size(), 0); + for (PostDfsIndex index = 0; index < topological_order_.size(); ++index) { + const Node* node = topological_order_[index].get(); + // We have a node. 
+ ICHECK(node); + // Bijections with post-dfs indexes and expressions/patterns are correct. + ICHECK_EQ(node->index_, index); + ICHECK(node->node_ref_); + auto itr = node_map_.find(node->node_ref_); + ICHECK(itr != node_map_.end()); + ICHECK_EQ(itr->second, node) << "at index " << index << " in:" << std::endl << ToString(); + // Inputs come before. + for (size_t i = 0; i < node->inputs_.size(); ++i) { + const Node* input = node->inputs_[i]; + ICHECK(input); + ICHECK_LT(input->index_, index); + ICHECK(std::find(input->outputs_.begin(), input->outputs_.end(), node) != + input->outputs_.end()); + } + // Outputs come after. + for (size_t i = 0; i < node->outputs_.size(); ++i) { + const Node* output = node->outputs_[i]; + ICHECK(output); + ICHECK_GT(output->index_, index); + ICHECK(std::find(output->inputs_.begin(), output->inputs_.end(), node) != + output->inputs_.end()); + } + ICHECK_GT(node->depth_, 0); + // Dominator children come before. + for (size_t i = 0; i < node->dominator_children_.size(); ++i) { + const Node* child = node->dominator_children_[i]; + ICHECK(child); + ICHECK_LT(child->index_, index); + } + if (node->dominator_parent_) { + // Dominator comes after. + ICHECK_GT(node->dominator_parent_->index_, index); + } + } + } + + private: /*! \brief Construct the domination tree inside IndexedGraph */ void PostDom() { - for (size_t i = topological_order_.size(); i != 0; --i) { - size_t index = i - 1; + for (PostDfsIndex i = topological_order_.size(); i != 0; --i) { + PostDfsIndex index = i - 1; auto* current = topological_order_[index].get(); if (current->is_external_) { current->depth_ = 1; @@ -109,16 +291,13 @@ class IndexedGraph { auto parent = LeastCommonAncestor(current->outputs_); current->depth_ = parent ? parent->depth_ + 1 : 1; current->dominator_parent_ = parent; - parent->dominator_children_.push_back(current); + if (parent) { + parent->dominator_children_.push_back(current); + } } } } - /*! \brief Map of input nodes to IndexedGraph Nodes */ - std::unordered_map, ObjectPtrHash, ObjectPtrEqual> node_map_; - /*! \brief Topological IndexedGraph Nodes */ - std::vector> topological_order_; - protected: /*! \brief Find the least common ancestor of all outputs of a node */ Node* LeastCommonAncestor(const std::vector& outputs) { if (outputs.size() == 0) { @@ -136,9 +315,11 @@ class IndexedGraph { if (lhs == nullptr || rhs == nullptr) { return nullptr; } + PostDfsIndex lhs_index = lhs->index_; + PostDfsIndex rhs_index = rhs->index_; while (lhs != rhs) { - ICHECK(lhs); - ICHECK(rhs); + ICHECK(lhs && rhs) << "LCA(" << lhs_index << ", " << rhs_index << ") on graph:" << std::endl + << ToString(); if (lhs->depth_ < rhs->depth_) { rhs = rhs->dominator_parent_; } else if (lhs->depth_ > rhs->depth_) { @@ -150,13 +331,41 @@ class IndexedGraph { } return lhs; } + + /*! + * \brief Appends a node corresponding to \p ref, and maintains the sub-expression/sub-pattern to + * node bijection. The insertion index will be the node's PostDfsIndex. All other node properties + * are accumulated in-place. + */ + void AddNode(const T& ref) { + PostDfsIndex index = topological_order_.size(); + auto node = std::make_unique(ref.get(), index); + node_map_[ref.get()] = node.get(); + topological_order_.emplace_back(std::move(node)); + } + + /*! + * \brief Map from underlying sub-expression or sub-pattern nodes to their indexed graph nodes. + */ + std::unordered_map node_map_; + /*! \brief All nodes in increasing post-dfs index order. This vector owns all the nodes. 
*/ + std::vector> topological_order_; + + friend std::unique_ptr> CreateIndexedGraph(const Expr& expr); + friend std::unique_ptr> CreateIndexedGraph(const DFPattern& pattern); }; -/*! \brief Create an Indexed Graph based on an Expr */ -IndexedGraph CreateIndexedGraph(const Expr& expr); -/*! \brief Create an Indexed Graph based on an DFPattern */ -IndexedGraph CreateIndexedGraph(const DFPattern& pattern); +/*! \brief Returns an Indexed Graph for \p expr, which much outlive the result. */ +std::unique_ptr> CreateIndexedGraph(const Expr& expr); + +/*! + * \brief Returns an Indexed Graph for \p pattern, which must outlive the result. + * The dataflow for a pattern mimics the dataflow for the expression which would match + * that pattern. + */ +std::unique_ptr> CreateIndexedGraph(const DFPattern& pattern); } // namespace relay } // namespace tvm + #endif // TVM_RELAY_IR_INDEXED_GRAPH_H_ diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index f7045305e90d3..d5cc6608662b2 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -258,6 +258,7 @@ RELAY_REGISTER_OP("dyn.broadcast_to") .describe(R"code(Broadcast the first input to match the shape argument. )code" TVM_ADD_FILELINE) .set_num_inputs(2) + .set_attrs_type() .add_argument("data", "Tensor", "The input tensor.") .add_argument("shape", "Tensor", "Target shape.") .set_support_level(4) diff --git a/tests/cpp/relay/ir/indexed_graph_test.cc b/tests/cpp/relay/ir/indexed_graph_test.cc new file mode 100644 index 0000000000000..17ec682616843 --- /dev/null +++ b/tests/cpp/relay/ir/indexed_graph_test.cc @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "../../../src/relay/ir/indexed_graph.h" + +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace { + +// A module stolen from onnx/test_forward.py::test_loop which combines functions, recursion, +// control flow, tuples as well as the usual operator calls. +// We include the known post-dfs indexes in comments to help write the tests. 
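As a reading aid for the post-dfs index annotations in the module below: indexes are assigned in post-dfs order, so every sub-expression is numbered before any expression that consumes it. A minimal sketch of checking that property through the new API (variable names are illustrative; the test body further down contains the real assertions):

    auto graph = CreateIndexedGraph(main);  // 'main' as parsed from the model below
    // Index 40 is the recursive call to %while_loop; all of its dataflow inputs were
    // visited earlier in the traversal and therefore carry smaller indexes.
    const auto* rec_call = graph->index_to_node(40);
    for (const auto* input : rec_call->inputs_) {
      ICHECK_LT(input->index_, rec_call->index_);
    }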
+IRModule TestRecursiveIRModule() { + Device device = {kDLCPU, 0}; + Constant const0(runtime::NDArray::Empty(ShapeTuple({1}), DataType::Int(64), device)); + Constant const1(runtime::NDArray::Empty(ShapeTuple({0, 1}), DataType::Float(32), device)); + Map> metadata; + metadata.Set("relay.Constant", Array({const0, const1})); + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%trip_count: int64, // 0 + %cond: bool, // 1 + %y: Tensor[(1), float32]) // 2 + -> (Tensor[(1), float32], Tensor[(?, ?), float32]) { + %17 = ( + let %while_loop = fn (%iter_count: int64, // 3 + %max_count: int64, // 4 + %cond_in: bool, // 5 + %y_in: Tensor[(1), float32], // 6 + %scan_out: Tensor[(?, ?), float32]) // 7 + -> (int64, int64, bool, Tensor[(1), float32], Tensor[(?, ?), float32]) { + %0 = equal(%cond_in, True); // 11 + %1 = less(%iter_count, %max_count); // 13 + %2 = logical_and(%0, %1); // 14 + if (%2) { + %3 = cast(%iter_count, dtype="float32"); // 20 + %4 = add(%y_in, %3); // 21 + %5 = less(%4, 5f); // 23 + %6 = squeeze(%5); // 24 + %7 = reshape(%iter_count, newshape=[1]); // 29 + %8 = (%7, meta[relay.Constant][0]); // 31 + %9 = concatenate(%8); // 32 + %10 = copy(%4); // 36 + %11 = dyn.broadcast_to(%scan_out, %9, shape=None); // 33 + %12 = expand_dims(%10, axis=0); // 37 + %13 = (%11, %12); // 38 + %14 = add(%iter_count, 1i64); // 17 + %15 = cast(%6, dtype="bool"); // 25 + %16 = concatenate(%13); // 39 + %while_loop(%14, %max_count, %15, %4, %16) // 40 + } else { + (%iter_count, %max_count, %cond_in, %y_in, %scan_out) // 41 + } // 42 + }; // 43 + %while_loop // 44 + ); // 45 + %18 = %17(0i64, %trip_count, %cond, %y, meta[relay.Constant][1]); // 48 + %19 = %18.3; // 49 + %20 = %18.4; // 50 + (%19, %20) // 51 + } // 52 + )"; + return parser::ParseModule("string", kModel, /*init_module=*/{}, metadata); +} + +TEST(IndexedGraph, RecursiveExprRegression) { + IRModule ir_mod = TestRecursiveIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = CreateIndexedGraph(main); + graph->CheckValid(); + + { + // Dataflow node properties for %4 + auto node = graph->index_to_node(21); + const auto* call_node = node->ref().as(); + ASSERT_NE(call_node, nullptr); + const auto* op_node = call_node->op.as(); + ASSERT_NE(op_node, nullptr); + ASSERT_EQ(op_node->name, "add"); + + // 3 inputs (the op itself is an input) + ASSERT_EQ(node->inputs_.size(), 3); + ASSERT_EQ(node->inputs_[0]->index_, 15); // the add op + ASSERT_EQ(node->inputs_[1]->index_, 6); // %y_in + ASSERT_EQ(node->inputs_[2]->index_, 20); // %3 + + // 3 outputs + ASSERT_EQ(node->outputs_.size(), 3); + ASSERT_EQ(node->outputs_[0]->index_, 23); // %5 + ASSERT_EQ(node->outputs_[1]->index_, 36); // %10 + ASSERT_EQ(node->outputs_[2]->index_, 40); // recursive %while_loop call + + // In the 'if' basic block + ASSERT_EQ(node->basic_block_->index_, 42); + + // Dominator 'parent' is recursive call + ASSERT_EQ(node->dominator_parent_->index_, 40); + + // One dominator child from %3 + ASSERT_EQ(node->dominator_children_.size(), 1); + ASSERT_EQ(node->dominator_children_[0]->index_, 20); + } + + { + // The recursive call to %while_loop does not depend on %while_loop + auto node = graph->index_to_node(40); + const auto* call_node = node->ref().as(); + ASSERT_NE(call_node, nullptr); + const auto* var_node = call_node->op.as(); + ASSERT_NE(var_node, nullptr); + ASSERT_EQ(var_node->name_hint(), "while_loop"); + + ASSERT_EQ(node->inputs_.size(), 5); + ASSERT_EQ(node->inputs_[0]->index_, 17); // %14 + ASSERT_EQ(node->inputs_[1]->index_, 4); // 
%max_count + ASSERT_EQ(node->inputs_[2]->index_, 25); // %15 + ASSERT_EQ(node->inputs_[3]->index_, 21); // %4 + ASSERT_EQ(node->inputs_[4]->index_, 39); // %16 + } + + { + // Downstream nodes of %18 + auto node = graph->index_to_node(48); + std::unordered_set::Node*> downstreams; + node->AccumulateDownstreamNodes(&downstreams); + ASSERT_EQ(downstreams.size(), 4); + for (const auto* downstream : downstreams) { + ASSERT_TRUE(downstream->index_ >= 49 && downstream->index_ <= 52); + } + } + + { + // Dominates relation for %4 + auto upstream = graph->index_to_node(21); + // Path 1: 21->23->24->25->40 + // Path 2: 21->36->37->38->39->40 + // Then 40->43 + auto downstream = graph->index_to_node(43); + ASSERT_TRUE(downstream->Dominates(upstream)); + } +} + +// A module with unused let-bound function. The 'add' operator should have no dominator +// since it is used both in the unused function and in the main body. +IRModule TestUnusedLetBoundIRModule() { + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%x: int64) -> int64 { // 0 + let %f = fn ( // 5 + %y: int64 // 1 + ) { + add(%x, %y) // 3 + }; + if (less(%x, 5i64)) { + add(%x, 3i64) // 10 + } else { + %x + } + } + )"; + return parser::ParseModule("string", kModel); +} + +TEST(IndexedGraph, UnusedLetVars) { + IRModule ir_mod = TestUnusedLetBoundIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = CreateIndexedGraph(main); + graph->CheckValid(); + + { + auto node = graph->index_to_node(2); + const auto* op_node = node->ref().as(); + ICHECK(op_node); + ICHECK_EQ(op_node->name, "add"); + ICHECK_EQ(node->outputs_.size(), 2); + ICHECK_EQ(node->outputs_[0]->index_, 3); + ICHECK_EQ(node->outputs_[1]->index_, 10); + ICHECK(node->dominator_parent_ == nullptr); + } +} + +} // namespace +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py index 74e03f6a97551..f0474c9112736 100644 --- a/tests/python/relay/test_dataflow_pattern.py +++ b/tests/python/relay/test_dataflow_pattern.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=unused-wildcard-import import numpy as np -import pytest import tvm from tvm import relay @@ -601,6 +600,38 @@ def test_match_fake_diamond(): assert not diamond.match(out) +def test_at_most_one_parent(): + # Pattern + P = is_op("nn.conv2d")(wildcard(), wildcard()) # 'parent' + I = is_op("nn.relu")(wildcard()) # 'intermediate' ('path' in the code) + C = is_op("add")(wildcard(), wildcard()) # 'child' + pattern = dominates(P, I, C) + + # n6(P) + # / \ + # n7 \ + # / \ + # n8(P) n10(I) + # \ / + # n9(I) / + # \ / + # n11(C) + + x = relay.var("x") + w = relay.var("w") + n6 = relay.op.nn.conv2d(x, w) # matches P + n7 = relay.op.tanh(n6) # does not match I + n8 = relay.op.nn.conv2d(n7, w) # matches P + n9 = relay.op.nn.relu(n8) # matches I + n10 = relay.op.nn.relu(n6) # matches I + n11 = relay.add(n9, n10) # matches C + + # Does not match: Can't match the parent pattern P at both 8 and 6. + # Note that if we did allow P to be used twice the implementation would + # need to be changed to not 'jump over' n7. 
+ assert not pattern.match(n11) + + def test_match_dominator(): # Pattern is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()) @@ -1760,4 +1791,4 @@ def callback(self, pre, post, node_map): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() From 774ee969fcb19e9d16e74de77c64848fd30e9a52 Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Tue, 7 Jun 2022 18:16:25 -0400 Subject: [PATCH 061/181] [relay] add missing virtual d'tor (#11601) Add a default virtual destructor to `tvm::relay::transforms::GlobalSymbolCache`, so that correct destructors run when destroying subclass instances. --- src/relay/transforms/compiler_function_utils.cc | 2 ++ src/relay/transforms/compiler_function_utils.h | 1 + 2 files changed, 3 insertions(+) diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc index b98d089b346a3..f22e9bd80dd07 100644 --- a/src/relay/transforms/compiler_function_utils.cc +++ b/src/relay/transforms/compiler_function_utils.cc @@ -119,6 +119,8 @@ class CallRewriter : public MixedModeMutator { } // namespace +GlobalSymbolCache::~GlobalSymbolCache() = default; + GlobalVar ExistingGlobalSymbolCache::GetGlobalSymbol(const Function& function) { Optional opt_global_symbol = function->GetAttr(tvm::attr::kGlobalSymbol); ICHECK(opt_global_symbol.defined()) diff --git a/src/relay/transforms/compiler_function_utils.h b/src/relay/transforms/compiler_function_utils.h index 7b5143444bf8a..e4b1f05211fe1 100644 --- a/src/relay/transforms/compiler_function_utils.h +++ b/src/relay/transforms/compiler_function_utils.h @@ -71,6 +71,7 @@ namespace transforms { */ class GlobalSymbolCache { public: + virtual ~GlobalSymbolCache(); virtual GlobalVar GetGlobalSymbol(const Function& function) = 0; }; From d490620085792f802d606209008698e65fb12c0e Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 7 Jun 2022 17:16:37 -0500 Subject: [PATCH 062/181] [Hexagon][CI] Re-enable Hexagon tests in CI (#11613) * [Hexagon][CI] Re-enable Hexagon tests in CI These were enabled in https://github.com/apache/tvm/pull/11294, then erroneously disabled in https://github.com/apache/tvm/pull/11313. This applies the same fix as in https://github.com/apache/tvm/pull/11294, checking the `ANDROID_SERIAL_NUMBER` to determine if Hexagon tests can execute at runtime, but using the refactored `pytest.skipif` messages introduced in https://github.com/apache/tvm/pull/11313. * Fixed circular dependency, but feels somewhat ugly --- python/tvm/contrib/hexagon/_ci_env_check.py | 62 +++++++++++++++++++++ python/tvm/contrib/hexagon/pytest_plugin.py | 10 +--- python/tvm/testing/utils.py | 8 +-- 3 files changed, 66 insertions(+), 14 deletions(-) create mode 100644 python/tvm/contrib/hexagon/_ci_env_check.py diff --git a/python/tvm/contrib/hexagon/_ci_env_check.py b/python/tvm/contrib/hexagon/_ci_env_check.py new file mode 100644 index 0000000000000..c1c70750e86ae --- /dev/null +++ b/python/tvm/contrib/hexagon/_ci_env_check.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Hexagon environment checks for CI usage
+
+These may be required by either tvm.testing or
+tvm.contrib.hexagon.pytest_plugin, and are separated here to avoid a
+circular dependency.
+"""
+
+import os
+
+import tvm
+
+ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER"
+HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN"
+
+
+def _compile_time_check():
+    """Return True if compile-time support for Hexagon is present, otherwise
+    error string.
+
+    Designed for use as the ``compile_time_check`` argument to
+    `tvm.testing.Feature`.
+    """
+    if (
+        tvm.testing.utils._cmake_flag_enabled("USE_LLVM")
+        and tvm.target.codegen.llvm_version_major() < 7
+    ):
+        return "Hexagon requires LLVM 7 or later"
+
+    if "HEXAGON_TOOLCHAIN" not in os.environ:
+        return f"Missing environment variable {HEXAGON_TOOLCHAIN}."
+
+    return True
+
+
+def _run_time_check():
+    """Return True if run-time support for Hexagon is present, otherwise
+    error string.
+
+    Designed for use as the ``run_time_check`` argument to
+    `tvm.testing.Feature`.
+    """
+    if ANDROID_SERIAL_NUMBER not in os.environ:
+        return f"Missing environment variable {ANDROID_SERIAL_NUMBER}."
+
+    return True
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 2c62a0a0b5694..278bd833da954 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -53,15 +53,7 @@ def _compose(args, decs):
     return decs


-def requires_hexagon_toolchain(*args):
-    _requires_hexagon_toolchain = [
-        pytest.mark.skipif(
-            os.environ.get(HEXAGON_TOOLCHAIN) is None,
-            reason=f"Missing environment variable {HEXAGON_TOOLCHAIN}.",
-        ),
-    ]
-
-    return _compose(args, _requires_hexagon_toolchain)
+requires_hexagon_toolchain = tvm.testing.requires_hexagon(support_required="compile-only")


 @tvm.testing.fixture
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 939786c9294fc..bf3cc94f5ddf7 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -88,6 +88,7 @@ def test_something():

 import tvm._ffi
 from tvm.contrib import nvcc, cudnn
+import tvm.contrib.hexagon._ci_env_check as hexagon
 from tvm.error import TVMError


@@ -937,11 +938,8 @@ def _any_gpu_exists():
     "Hexagon",
     cmake_flag="USE_HEXAGON",
     target_kind_enabled="hexagon",
-    compile_time_check=lambda: (
-        (_cmake_flag_enabled("USE_LLVM") and tvm.target.codegen.llvm_version_major() >= 7)
-        or "Hexagon requires LLVM 7 or later"
-    ),
-    target_kind_hardware="hexagon",
+    compile_time_check=hexagon._compile_time_check,
+    run_time_check=hexagon._run_time_check,
     parent_features="llvm",
 )

From 52d90da1d3bc6b12611b1d30a38c02837fbf8d76 Mon Sep 17 00:00:00 2001
From: "Kathryn (Jinqi) Chen" <65606304+Kathryn-cat@users.noreply.github.com>
Date: Tue, 7 Jun 2022 18:05:14 -0700
Subject: [PATCH 063/181] [MetaSchedule] TuningRecord Optional Arguments
 (#11598)

In some situations, such as before measuring the candidates, the arguments
`run_secs`, `target`, and `args_info` in `TuningRecord` are not required.
Per this request, the new `TuningRecord` API now accepts arguments in the order of `trace, workload, run_secs, target, args_info` with the last three being optional. Note that some tests might fail due to the change of argument order, so they might need to be adjusted accordingly. --- include/tvm/meta_schedule/database.h | 17 +++--- python/tvm/meta_schedule/database/database.py | 26 ++++----- python/tvm/meta_schedule/testing/utils.py | 2 +- src/meta_schedule/database/database.cc | 54 ++++++++++++------- src/meta_schedule/database/json_database.cc | 4 +- .../measure_callback/add_to_database.cc | 2 +- .../unittest/test_meta_schedule_database.py | 26 ++++----- .../test_meta_schedule_integration.py | 2 +- .../unittest/test_meta_schedule_tune_relay.py | 2 +- 9 files changed, 75 insertions(+), 60 deletions(-) diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 1353dec3eda3f..37a315bf744e9 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -103,19 +103,19 @@ class TuningRecordNode : public runtime::Object { public: /*! \brief The trace tuned. */ tir::Trace trace; - /*! \brief The profiling result in seconds. */ - Array run_secs; /*! \brief The workload. */ Workload workload{nullptr}; + /*! \brief The profiling result in seconds. */ + Optional> run_secs; /*! \brief The target for tuning. */ - Target target; + Optional target; /*! \brief The argument information. */ - Array args_info; + Optional> args_info; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("trace", &trace); - v->Visit("run_secs", &run_secs); v->Visit("workload", &workload); + v->Visit("run_secs", &run_secs); v->Visit("target", &target); v->Visit("args_info", &args_info); } @@ -140,13 +140,14 @@ class TuningRecord : public runtime::ObjectRef { /*! \brief Constructor of a tuning record. \param trace The trace of the tuning record. - \param run_secs The running time of the tuning record. \param workload The workload of the tuning record. + \param run_secs The running time of the tuning record. \param target The target of the tuning record. \param args_info The argument information of the tuning record. */ - TVM_DLL explicit TuningRecord(tir::Trace trace, Array run_secs, Workload workload, - Target target, Array args_info); + TVM_DLL explicit TuningRecord(tir::Trace trace, Workload workload, + Optional> run_secs, Optional target, + Optional> args_info); /*! * \brief Create a tuning record from a json object. * \param json_obj The json object. diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py index 314bf434c417f..8e0c805410204 100644 --- a/python/tvm/meta_schedule/database/database.py +++ b/python/tvm/meta_schedule/database/database.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """Tuning record database""" -from typing import Any, Callable, List +from typing import Any, Callable, List, Optional from tvm._ffi import register_object from tvm.ir.module import IRModule @@ -82,35 +82,35 @@ class TuningRecord(Object): ---------- trace : tvm.ir.Trace The trace of the tuning record. - run_secs : List[float] - The run time of the tuning record. workload : Workload The workload of the tuning record. - target : Target + run_secs : Optional[List[float]] + The run time of the tuning record. + target : Optional[Target] The target of the tuning record. 
- args_info : List[ArgInfo] + args_info : Optional[List[ArgInfo]] The argument information of the tuning record. """ trace: Trace - run_secs: List[float] workload: Workload - target: Target - args_info: List[ArgInfo] + run_secs: Optional[List[float]] + target: Optional[Target] + args_info: Optional[List[ArgInfo]] - def __init__( + def __init__( # type: ignore # pylint: disable=too-many-arguments self, trace: Trace, - run_secs: List[float], workload: Workload, - target: Target, - args_info: List[ArgInfo], + run_secs: Optional[List[float]] = None, + target: Optional[Target] = None, + args_info: Optional[List[ArgInfo]] = None, ) -> None: self.__init_handle_by_constructor__( _ffi_api.TuningRecord, # type: ignore # pylint: disable=no-member trace, - run_secs, workload, + run_secs, target, args_info, ) diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py index a832dfc6bcc4a..62950fdd0bb4a 100644 --- a/python/tvm/meta_schedule/testing/utils.py +++ b/python/tvm/meta_schedule/testing/utils.py @@ -155,7 +155,7 @@ def apply_fixed_schedules( if schedule_fn(task, sch): workload = database.commit_workload(mod) - tune_rec = TuningRecord(sch.trace, [0.0], workload, target, []) + tune_rec = TuningRecord(sch.trace, workload, [0.0], target, []) database.commit_tuning_record(tune_rec) return database diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index fc7cc74de5c67..86d999e4fdf59 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -74,48 +74,62 @@ Workload Workload::FromJSON(const ObjectRef& json_obj) { /******** TuningRecord ********/ -TuningRecord::TuningRecord(tir::Trace trace, Array run_secs, Workload workload, - Target target, Array args_info) { +TuningRecord::TuningRecord(tir::Trace trace, Workload workload, Optional> run_secs, + Optional target, Optional> args_info) { ObjectPtr n = make_object(); n->trace = trace; - n->run_secs = run_secs; n->workload = workload; + n->run_secs = run_secs; n->target = target; n->args_info = args_info; this->data_ = n; } ObjectRef TuningRecordNode::AsJSON() const { - Array json_args_info; - json_args_info.reserve(args_info.size()); - for (const ArgInfo& arg_info : args_info) { - json_args_info.push_back(arg_info->AsJSON()); + Optional> json_args_info{nullptr}; + Optional json_target{nullptr}; + if (args_info.defined()) { + Array info; + info.reserve(args_info.value().size()); + for (const ArgInfo& arg_info : args_info.value()) { + info.push_back(arg_info->AsJSON()); + } + json_args_info = info; + } + if (target.defined()) { + json_target = target.value()->Export(); } return Array{trace->AsJSON(false), // run_secs, // - target->Export(), // + json_target, // json_args_info}; } TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& workload) { tir::Trace trace{nullptr}; - Array run_secs{nullptr}; - Target target{nullptr}; - Array args_info; + Optional> run_secs{nullptr}; + Optional target{nullptr}; + Optional> args_info{nullptr}; try { const ArrayNode* json_array = json_obj.as(); CHECK(json_array && json_array->size() == 4); // Load json[1] => run_secs - run_secs = Downcast>(json_array->at(1)); + if (json_array->at(1).defined()) { + run_secs = Downcast>(json_array->at(1)); + } // Load json[2] => target - target = Target(Downcast>(json_array->at(2))); + if (json_array->at(2).defined()) { + target = Target(Downcast>(json_array->at(2))); + } // Load json[3] => args_info - { + if (json_array->at(3).defined()) { 
const ArrayNode* json_args_info = json_array->at(3).as(); - args_info.reserve(json_args_info->size()); + Array info; + info.reserve(json_args_info->size()); for (const ObjectRef& json_arg_info : *json_args_info) { - args_info.push_back(ArgInfo::FromJSON(json_arg_info)); + info.push_back(ArgInfo::FromJSON(json_arg_info)); } + args_info = info; } // Load json[0] => trace { @@ -130,7 +144,7 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj << "\nThe error is: " << e.what(); } - return TuningRecord(trace, run_secs, workload, target, args_info); + return TuningRecord(trace, workload, run_secs, target, args_info); } /******** PyDatabase ********/ @@ -161,9 +175,9 @@ TVM_REGISTER_GLOBAL("meta_schedule.WorkloadAsJSON") .set_body_method(&WorkloadNode::AsJSON); TVM_REGISTER_GLOBAL("meta_schedule.WorkloadFromJSON").set_body_typed(&Workload::FromJSON); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecord") - .set_body_typed([](tir::Trace trace, Array run_secs, Workload workload, Target target, - Array args_info) { - return TuningRecord(trace, run_secs, workload, target, args_info); + .set_body_typed([](tir::Trace trace, Workload workload, Optional> run_secs, + Optional target, Optional> args_info) { + return TuningRecord(trace, workload, run_secs, target, args_info); }); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON") .set_body_method(&TuningRecordNode::AsJSON); diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 2e76940feee39..155d223217da9 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -40,8 +40,8 @@ struct SortTuningRecordByMeanRunSecs { } bool operator()(const TuningRecord& a, const TuningRecord& b) const { - double a_time = Mean(a->run_secs); - double b_time = Mean(b->run_secs); + double a_time = Mean(a->run_secs.value_or({})); + double b_time = Mean(b->run_secs.value_or({})); return a_time < b_time; } }; diff --git a/src/meta_schedule/measure_callback/add_to_database.cc b/src/meta_schedule/measure_callback/add_to_database.cc index 0988da0414e2a..27b4e55a7de5b 100644 --- a/src/meta_schedule/measure_callback/add_to_database.cc +++ b/src/meta_schedule/measure_callback/add_to_database.cc @@ -47,8 +47,8 @@ class AddToDatabaseNode : public MeasureCallbackNode { } database->CommitTuningRecord(TuningRecord( /*trace=*/candidate->sch->trace().value(), - /*run_secs=*/run_secs, /*workload=*/workload, + /*run_secs=*/run_secs, /*target=*/target, /*args_info=*/candidate->args_info)); } diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py index d494f997c1ce7..1edfbe6c7a782 100644 --- a/tests/python/unittest/test_meta_schedule_database.py +++ b/tests/python/unittest/test_meta_schedule_database.py @@ -115,8 +115,8 @@ def test_meta_schedule_tuning_record_round_trip(): workload = database.commit_workload(mod) record = TuningRecord( _create_schedule(mod, _schedule_matmul).trace, - [1.5, 2.5, 1.8], workload, + [1.5, 2.5, 1.8], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ) @@ -140,8 +140,8 @@ def test_meta_schedule_database_has_workload(): workload = database.commit_workload(mod) record = TuningRecord( _create_schedule(mod, _schedule_matmul).trace, - [1.5, 2.5, 1.8], workload, + [1.5, 2.5, 1.8], tvm.target.Target("llvm"), 
ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ) @@ -158,8 +158,8 @@ def test_meta_schedule_database_add_entry(): workload = database.commit_workload(mod) record = TuningRecord( _create_schedule(mod, _schedule_matmul).trace, - [1.5, 2.5, 1.8], workload, + [1.5, 2.5, 1.8], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ) @@ -178,8 +178,8 @@ def test_meta_schedule_database_missing(): workload_2 = database.commit_workload(mod_2) record = TuningRecord( _create_schedule(mod, _schedule_matmul).trace, - [1.5, 2.5, 1.8], workload, + [1.5, 2.5, 1.8], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ) @@ -197,43 +197,43 @@ def test_meta_schedule_database_sorting(): records = [ TuningRecord( trace, - [7.0, 8.0, 9.0], token, + [7.0, 8.0, 9.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [1.0, 2.0, 3.0], token, + [1.0, 2.0, 3.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [4.0, 5.0, 6.0], token, + [4.0, 5.0, 6.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [1.1, 1.2, 600.0], token, + [1.1, 1.2, 600.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [1.0, 100.0, 6.0], token, + [1.0, 100.0, 6.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [4.0, 9.0, 8.0], token, + [4.0, 9.0, 8.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), @@ -259,22 +259,22 @@ def test_meta_schedule_database_reload(): records = [ TuningRecord( trace, - [7.0, 8.0, 9.0], token, + [7.0, 8.0, 9.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [1.0, 2.0, 3.0], token, + [1.0, 2.0, 3.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), TuningRecord( trace, - [4.0, 5.0, 6.0], token, + [4.0, 5.0, 6.0], tvm.target.Target("llvm"), ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object ), diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py index a423bdb48afdf..3b33039bd2874 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -267,7 +267,7 @@ def test_meta_schedule_integration_apply_history_best(): target = Target("llvm") workload = database.commit_workload(MockModule) database.commit_tuning_record( - TuningRecord(Schedule(MockModule).trace, [1.0], workload, target, []) + TuningRecord(Schedule(MockModule).trace, workload, [1.0], target, []) ) mod = env.query(task_name="mock-task", mod=mod, target=target, dispatched=[MockModule]) assert tvm.ir.structural_equal(mod, workload.mod) diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index e5076af520f30..e0883dbd227ed 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ 
b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -307,8 +307,8 @@ def test_meta_schedule_relay_lowering(): database.commit_tuning_record( TuningRecord( Trace([], {}), - [0.0], database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc), + [0.0], target=target, args_info=[], ) From f5f9600614c4aa933c863001459d92b13d9b72fc Mon Sep 17 00:00:00 2001 From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com> Date: Tue, 7 Jun 2022 22:08:28 -0700 Subject: [PATCH 064/181] [docs] Various content corrections (#11517) * [docs] Various content corrections * Fix underline title --- gallery/how_to/deploy_models/deploy_sparse.py | 8 ++++---- .../how_to/extend_tvm/bring_your_own_datatypes.py | 2 +- gallery/how_to/extend_tvm/low_level_custom_pass.py | 2 +- gallery/how_to/extend_tvm/use_pass_infra.py | 8 ++++---- gallery/how_to/extend_tvm/use_pass_instrument.py | 4 ++-- gallery/how_to/optimize_operators/opt_conv_cuda.py | 2 +- .../optimize_operators/opt_conv_tensorcore.py | 2 +- gallery/how_to/optimize_operators/opt_gemm.py | 4 ++-- .../how_to/tune_with_autotvm/tune_conv2d_cuda.py | 2 +- gallery/how_to/work_with_relay/build_gcn.py | 2 +- gallery/how_to/work_with_relay/using_relay_viz.py | 6 +++--- gallery/how_to/work_with_schedules/extern_op.py | 4 ++-- gallery/how_to/work_with_schedules/intrin_math.py | 2 +- gallery/how_to/work_with_schedules/scan.py | 2 +- gallery/tutorial/auto_scheduler_matmul_x86.py | 4 ++-- gallery/tutorial/autotvm_matmul_x86.py | 14 +++++++------- gallery/tutorial/intro_topi.py | 3 +-- gallery/tutorial/tensor_expr_get_started.py | 8 ++++---- gallery/tutorial/tensor_ir_blitz_course.py | 6 +++--- 19 files changed, 42 insertions(+), 43 deletions(-) diff --git a/gallery/how_to/deploy_models/deploy_sparse.py b/gallery/how_to/deploy_models/deploy_sparse.py index 768a697f45cfc..56a5f1aafd1ce 100644 --- a/gallery/how_to/deploy_models/deploy_sparse.py +++ b/gallery/how_to/deploy_models/deploy_sparse.py @@ -36,11 +36,11 @@ Pruning is a technique primarily used to reduce the parameter size of a model by replacing weight values with 0s. Although many methods exist for choosing which -weights should be set to 0, the most straight forward is by picking the +weights should be set to 0, the most straight forward is by picking the weights with the smallest value. Typically, weights are pruned to a desired sparsity percentage. For example, a 95% sparse model would have only 5% of its weights non-zero. Pruning to very high sparsities often requires -finetuning or full retraining as it tends to be a lossy approximation. +fine-tuning or full retraining as it tends to be a lossy approximation. Although parameter size benefits are quite easy to obtain from a pruned model through simple compression, leveraging sparsity to yield runtime speedups is more complicated. @@ -50,8 +50,8 @@ value and location. The benefit of bunching up pruned weights is that it allows an algorithm such as matrix multiplication to skip entire blocks. It turns out that some degree of *block sparsity* is very important to realizing significant -speedups on most hardware available today. -This is because when loading memory in most CPUs or GPUs, +speedups on most hardware available today. +This is because when loading memory in most CPUs or GPUs, it doesn't save any work to skip reading a single value at a time, instead an entire chunk or tile is read in and executed using something like vectorized instructions. 
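A quick aside to make the block-sparsity point above concrete. The following is an illustrative sketch only, not part of this patch: it assumes NumPy and SciPy are available, and the shapes, block size, and sparsity level are made up for demonstration.

    import numpy as np
    from scipy import sparse

    dense = np.random.rand(64, 64).astype("float32")
    # Prune whole (16, 1) blocks so the sparsity is block-structured,
    # leaving roughly 5% of the blocks non-zero.
    keep_blocks = np.random.rand(4, 64) > 0.95
    dense[~np.repeat(keep_blocks, 16, axis=0)] = 0.0

    # BSR storage keeps only the non-zero blocks, so the multiply walks whole
    # blocks (friendly to vectorized loads) instead of testing single values.
    bsr = sparse.bsr_matrix(dense, blocksize=(16, 1))
    x = np.random.rand(64, 1).astype("float32")
    np.testing.assert_allclose(bsr.dot(x), dense @ x, rtol=1e-5)

The same idea is what lets a block-sparse matmul skip the pruned blocks entirely at runtime rather than reading them and multiplying by zero.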
diff --git a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py index 018245609923a..1a48781e24336 100644 --- a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py +++ b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py @@ -313,7 +313,7 @@ def convert_ndarray(dst_dtype, array): print(str(e).split("\n")[-1]) ###################################################################### -# When we attempt to run the model, we get a familiar error telling us that more functions need to be registerd for myfloat. +# When we attempt to run the model, we get a familiar error telling us that more functions need to be registered for myfloat. # # Because this is a neural network, many more operations are required. # Here, we register all the needed functions: diff --git a/gallery/how_to/extend_tvm/low_level_custom_pass.py b/gallery/how_to/extend_tvm/low_level_custom_pass.py index 8f631075429fd..ee96d8220cac3 100644 --- a/gallery/how_to/extend_tvm/low_level_custom_pass.py +++ b/gallery/how_to/extend_tvm/low_level_custom_pass.py @@ -129,7 +129,7 @@ def vectorize(f, mod, ctx): tvm.tir.stmt_functor.post_order_visit(f.body, find_width8) if not loops: - return sf + return f # The last list arugment indicates what kinds of nodes will be transformed. # Thus, in this case only `For` nodes will call `vectorize8` diff --git a/gallery/how_to/extend_tvm/use_pass_infra.py b/gallery/how_to/extend_tvm/use_pass_infra.py index 67cdfdedce0e8..e38383e69011a 100644 --- a/gallery/how_to/extend_tvm/use_pass_infra.py +++ b/gallery/how_to/extend_tvm/use_pass_infra.py @@ -35,7 +35,7 @@ pass infra. For more details about each type of these passes, please refer to the :ref:`pass-infra` -This tutorial mainly demostrates how developers can use the pass infra to perform +This tutorial mainly demonstrates how developers can use the pass infra to perform a certain optimization and create an optimization pipeline for a Relay program. The same approach can be used for tir as well. """ @@ -104,7 +104,7 @@ def example(): print(mod) ############################################################################### -# Some optimizations, such as fusion, are parameteric as well. For example, +# Some optimizations, such as fusion, are parametric as well. For example, # opt level 0 will not allow operators to be fused together. Users can pass the # `fuse_opt_level` to enable this. mod = relay.transform.FuseOps(fuse_opt_level=0)(mod) @@ -127,7 +127,7 @@ def example(): # these issues explicitly by specifying the required passes of each pass and # packing them as a whole to execute. For example, the same passes can now be # applied using the sequential style as the following. :py:class:`tvm.transform.Sequential` is -# similiar to `torch.nn.sequential `_ +# similar to `torch.nn.sequential `_ # and `mxnet.gluon.block `_. # For example, `torch.nn.sequential` is used to contain a sequence of PyTorch # `Modules` that will be added to build a network. It focuses on the network @@ -267,7 +267,7 @@ def run_before_pass(self, mod, info): # ------- # This tutorial has covered how we can write and invoke passes in TVM more # conveniently using the pass infra. Different ways of invoking a pass are also -# disucssed. Using :py:class:`tvm.transform.Sequential` can largely help +# discussed. Using :py:class:`tvm.transform.Sequential` can largely help # users to ease the work of handling multiple optimization passes and their # dependencies. 
In addition, an example is provided to illustrate # how we can debug a pass using the ``PrintIR`` and tracing. diff --git a/gallery/how_to/extend_tvm/use_pass_instrument.py b/gallery/how_to/extend_tvm/use_pass_instrument.py index 3369304a651d3..036aa63e374f0 100644 --- a/gallery/how_to/extend_tvm/use_pass_instrument.py +++ b/gallery/how_to/extend_tvm/use_pass_instrument.py @@ -30,7 +30,7 @@ for collecting timing information (:py:class:`tvm.ir.instrument.PassTimingInstrument`), but an extension mechanism is available via the :py:func:`tvm.instrument.pass_instrument` decorator. -This tutorial demostrates how developers can use ``PassContext`` to instrument +This tutorial demonstrates how developers can use ``PassContext`` to instrument passes. Please also refer to the :ref:`pass-infra`. """ import tvm @@ -314,7 +314,7 @@ def exit_pass_ctx(self): print("Catching", str(ex).split("\n")[-1]) ############################################################################### -# Exceptions occured in ``should_run``, ``run_before_pass``, ``run_after_pass`` +# Exceptions occurred in ``should_run``, ``run_before_pass``, ``run_after_pass`` # are not handled explicitly -- we rely on the context manager (the ``with`` syntax) # to exit ``PassContext`` safely. # diff --git a/gallery/how_to/optimize_operators/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py index 0ac2c625bf781..3d2caa0d31214 100644 --- a/gallery/how_to/optimize_operators/opt_conv_cuda.py +++ b/gallery/how_to/optimize_operators/opt_conv_cuda.py @@ -97,7 +97,7 @@ # :width: 271px # # In this example, we load both Apad and W into buffer AA and WW, which are -# stored in the shared memory. These bufferes will be later shared by all +# stored in the shared memory. These buffers will be later shared by all # threads within the same thread block to compute the convolution. Each thread # then loads its own part from shared buffer into their local registers, AL and # WL. BL is a local cache of output B, which is also stored in the thread local diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py index 702e4a777df57..ccfc7b9743aaa 100644 --- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py +++ b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py @@ -306,7 +306,7 @@ def intrin_func(ins, outs): # *Warp-level Operation* # # Note that all TensorCore instructions are warp-level instructions, which means all 32 threads -# in a warp should do this instruction simultaneously. Making theadIdx.x extent=32 is one of the +# in a warp should do this instruction simultaneously. Making threadIdx.x extent=32 is one of the # easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain # TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution. # The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time. 
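To make the threadIdx.x remark above concrete, here is a minimal sketch, not part of this patch, of the usual TE binding pattern: the tensor shapes and names (n, A, B) are invented for illustration, and it only lowers the schedule rather than running it, so no GPU is needed.

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)

    # Split so the inner loop has extent 32 (one warp), then bind it to
    # threadIdx.x so all 32 lanes of the warp execute it together.
    bx, tx = s[B].split(B.op.axis[0], factor=32)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))

    print(tvm.lower(s, [A, B], simple_mode=True))

The tutorial's actual TensorCore schedule follows the same pattern, binding a loop of extent 32 to threadIdx.x outside the loops that carry the TensorCore intrinsics.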
diff --git a/gallery/how_to/optimize_operators/opt_gemm.py b/gallery/how_to/optimize_operators/opt_gemm.py index 5d698c612ee8f..920d7a87fabf9 100644 --- a/gallery/how_to/optimize_operators/opt_gemm.py +++ b/gallery/how_to/optimize_operators/opt_gemm.py @@ -312,7 +312,7 @@ s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) -# TODO: Add separate optimization step to discuss loop unrolloing +# TODO: Add separate optimization step to discuss loop unrolling # unrolling is a loop optimization strategy which can reduce branch # prediction failures and increases the chance of concurrent execution # unroll kfactor loops @@ -390,4 +390,4 @@ # our generated code can achieve 60% of the `numpy` performance with MKL. # Note that the outputs on the web page reflect the running times on a non-exclusive # Docker container, thereby they are *unreliable*. It is highly encouraged to run the -# tutorial by yourself to observe the performance gain acheived by TVM. +# tutorial by yourself to observe the performance gain achieved by TVM. diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py index ef921563e466f..e3072773bf593 100644 --- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py @@ -74,7 +74,7 @@ # # If you are familiar with writing cuda schedule, you can find the following # template is very general. Actually this template can be easily modified -# to tune other operators such as depthwise convolution and gemm. +# to tune other operators such as depthwise convolution and GEMM. # In order to fully understand this template, you should be familiar with # the schedule primitives and auto tuning API. You can refer to the above # tutorials and :ref:`autotvm tutorial ` diff --git a/gallery/how_to/work_with_relay/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py index d76baec1eec14..fcffbd77ff86b 100644 --- a/gallery/how_to/work_with_relay/build_gcn.py +++ b/gallery/how_to/work_with_relay/build_gcn.py @@ -314,7 +314,7 @@ def prepare_params(g, data): # Compile and run with TVM # ------------------------ # -# Export the weigths from PyTorch model to Python Dict +# Export the weights from PyTorch model to Python Dict model_params = {} for param_tensor in torch_model.state_dict(): model_params[param_tensor] = torch_model.state_dict()[param_tensor].numpy() diff --git a/gallery/how_to/work_with_relay/using_relay_viz.py b/gallery/how_to/work_with_relay/using_relay_viz.py index 10e6dab12e245..b0132f40b9b51 100644 --- a/gallery/how_to/work_with_relay/using_relay_viz.py +++ b/gallery/how_to/work_with_relay/using_relay_viz.py @@ -22,7 +22,7 @@ Relay IR module can contain lots of operations. Although an individual operation is usually easy to understand, putting them together can cause -a complicated, hard-to-read graph. Things can get even worse with optimiztion-passes +a complicated, hard-to-read graph. Things can get even worse with optimization-passes coming into play. This utility visualizes an IR module as nodes and edges. It defines a set of interfaces including @@ -89,7 +89,7 @@ # ------------------------------------------- # Sometimes we want to emphasize interested information, or parse things differently for a specific usage. # It is possible to provide customized parsers as long as it obeys the interface. -# Here demostrate how to customize parsers for ``relay.var``. +# Here demonstrate how to customize parsers for ``relay.var``. 
# We need to implement abstract interface :py:class:`tvm.contrib.relay_viz.interface.VizParser`. class YourAwesomeParser(VizParser): def __init__(self): @@ -131,7 +131,7 @@ def node(self, viz_node): super().node(viz_node) # if it's AwesomeVar, duplicate it. if viz_node.type_name == "AwesomeVar": - duplicated_id = f"duplciated_{viz_node.identity}" + duplicated_id = f"duplicated_{viz_node.identity}" duplicated_type = "double AwesomeVar" super().node(VizNode(duplicated_id, duplicated_type, "")) # connect the duplicated var to the original one diff --git a/gallery/how_to/work_with_schedules/extern_op.py b/gallery/how_to/work_with_schedules/extern_op.py index fb9b2eaf8d13b..a0aa5d72450c0 100644 --- a/gallery/how_to/work_with_schedules/extern_op.py +++ b/gallery/how_to/work_with_schedules/extern_op.py @@ -25,7 +25,7 @@ some of the convolution kernels and define the rest of the stages. TVM supports these black box function calls natively. -Specfically, TVM support all the tensor functions that are DLPack compatible. +Specifically, TVM support all the tensor functions that are DLPack compatible. Which means we can call any function with POD types(pointer, int, float) or pointer to DLTensor as argument. """ @@ -52,7 +52,7 @@ # list of symbolic placeholder for the outputs and returns the executing statement. # # In this case we simply call a registered TVM function, which invokes a CBLAS call. -# TVM does not control internal of the extern array function and treats it as blackbox. +# TVM does not control internal of the extern array function and treats it as black-box. # We can further mix schedulable TVM calls that add a bias term to the result. # n = 1024 diff --git a/gallery/how_to/work_with_schedules/intrin_math.py b/gallery/how_to/work_with_schedules/intrin_math.py index 92383b90a53f9..535563bfb5306 100644 --- a/gallery/how_to/work_with_schedules/intrin_math.py +++ b/gallery/how_to/work_with_schedules/intrin_math.py @@ -26,7 +26,7 @@ These functions are target system dependent and may have different names of different target platforms. In this tutorial, we will learn how we can invoke these target specific functions, and how we can unify -the interface via tvm's intrinsic API. +the interface via TVM's intrinsic API. """ from __future__ import absolute_import, print_function import numpy as np diff --git a/gallery/how_to/work_with_schedules/scan.py b/gallery/how_to/work_with_schedules/scan.py index ba8b5a9f8e06a..3f3d7e91ee1c1 100644 --- a/gallery/how_to/work_with_schedules/scan.py +++ b/gallery/how_to/work_with_schedules/scan.py @@ -60,7 +60,7 @@ # Schedule the Scan Cell # ---------------------- # We can schedule the body of the scan by scheduling the update and -# init part seperately. Note that it is invalid to schedule the +# init part separately. Note that it is invalid to schedule the # first iteration dimension of the update part. # To split on the time iteration, user can schedule on scan_op.scan_axis instead. # diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py index 9f3a6070ccb23..b9f89f6723c9b 100644 --- a/gallery/tutorial/auto_scheduler_matmul_x86.py +++ b/gallery/tutorial/auto_scheduler_matmul_x86.py @@ -78,13 +78,13 @@ def matmul_add(N, L, M, dtype): # ---------------------- # With the function defined, we can now create the task for the auto_scheduler # to search against. 
We specify the particular parameters for this matrix -# multiplication, in this case a multiplication of to square matricies of size +# multiplication, in this case a multiplication of two square matrices of size # 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" # # .. admonition:: Improve performance with custom targets # # In order for TVM to take full advantage of specific hardware platforms, -# you will want to manuall specify your CPU capabilities. For example: +# you will want to manually specify your CPU capabilities. For example: # # - replace ``llvm`` below with ``llvm -mcpu=core-avx2`` to enable AVX2 # - replace ``llvm`` below with ``llvm -mcpu=skylake-avx512`` to enable AVX-512 diff --git a/gallery/tutorial/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py index 54581172115d2..b84a6193cde6e 100644 --- a/gallery/tutorial/autotvm_matmul_x86.py +++ b/gallery/tutorial/autotvm_matmul_x86.py @@ -28,7 +28,7 @@ find the optimal schedule. This process is called Auto-Tuning, which helps automate the process of optimizing tensor computation. -This tutorial builds on the previous `tutorial on how to write a matrix +This tutorial builds on the previous :doc:`tutorial on how to write a matrix multiplication using TE `. There are two steps in auto-tuning. @@ -201,7 +201,7 @@ def matmul_v1(N, L, M, dtype): # knob. This is the lowest level API to define the space, and gives an explicit # enumeration of the parameter space to search. However, we also provide # another set of APIs that can make the definition of the search space easier -# and smarter. Where possible, we receomment you use this higher-level API +# and smarter. Where possible, we recommend you use this higher-level API # # In the following example, we use :any:`ConfigSpace.define_split` to define a # split knob. It will enumerate all the possible ways to split an axis and @@ -267,7 +267,7 @@ def matmul(N, L, M, dtype): # Step 2: Use AutoTVM to Optimize the Matrix Multiplication # --------------------------------------------------------- # In Step 1, we wrote a matrix multiplication template that allowed us to -# paramaterize the block size used in the `split` schedule. We can now conduct +# parameterize the block size used in the `split` schedule. We can now conduct # a search over this parameter space. The next step is to pick a tuner to guide # the exploration of this space. # @@ -295,7 +295,7 @@ def matmul(N, L, M, dtype): # # You can choose the tuner according to the size of your space, your time # budget and other factors. For example, if your space is very small (less -# than 1000), a gridsearch tuner or a random tuner is good enough. If your +# than 1000), a grid-search tuner or a random tuner is good enough. If your # space is at the level of 10^9 (this is the space size of a conv2d operator on # CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. @@ -342,7 +342,7 @@ def matmul(N, L, M, dtype): ################################################################################ # With tuning completed, we can choose the configuration from the log file that # has the best measured performance and compile the schedule with the -# corresponding parameters. We also do a quick verfication that the schedule is +# corresponding parameters. We also do a quick verification that the schedule is # producing correct answers. We can call the function :code:`matmul` directly # under the :any:`autotvm.apply_history_best` context. 
When we call this # function, it will query the dispatch context with its argument and get the @@ -371,7 +371,7 @@ def matmul(N, L, M, dtype): # TVM to search a parameter space and choose optimized schedule configurations. # To gain a deeper understanding of how this works, we recommend expanding on # this example by adding new search parameters to the schedule based on -# schedule operations demonstated in the `Getting Started With Tensor +# schedule operations demonstrated in the :ref: `Getting Started With Tensor # Expressions _` tutorial. In the upcoming sections, we -# will demonstate the AutoScheduler, a method for TVM to optimize common +# will demonstrate the AutoScheduler, a method for TVM to optimize common # operators without the need for the user to provide a user-defined template. diff --git a/gallery/tutorial/intro_topi.py b/gallery/tutorial/intro_topi.py index dad8c53bf4ae3..17fa3ff370e54 100644 --- a/gallery/tutorial/intro_topi.py +++ b/gallery/tutorial/intro_topi.py @@ -23,9 +23,8 @@ This is an introductory tutorial to TVM Operator Inventory (TOPI). TOPI provides numpy-style generic operations and schedules with higher abstractions than TVM. -In this tutorial, we will see how TOPI can save us from writing boilerplates code in TVM. +In this tutorial, we will see how TOPI can save us from writing boilerplate code in TVM. """ -from __future__ import absolute_import, print_function import tvm import tvm.testing diff --git a/gallery/tutorial/tensor_expr_get_started.py b/gallery/tutorial/tensor_expr_get_started.py index 7d8c0d781a3f3..25ea4e8a55ee5 100644 --- a/gallery/tutorial/tensor_expr_get_started.py +++ b/gallery/tutorial/tensor_expr_get_started.py @@ -187,8 +187,8 @@ def evaluate_addition(func, target, optimization, log): evaluate_addition(fadd, tgt, "naive", log=log) ################################################################################ -# Updating the Schedule to Use Paralleism -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Updating the Schedule to Use Parallelism +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Now that we've illustrated the fundamentals of TE, let's go deeper into what # schedules do, and how they can be used to optimize tensor expressions for @@ -754,7 +754,7 @@ def evaluate_operation(s, vars, target, name, optimization, log): # regular but discontinuous. We expect that after some transformation we can # get a continuous access pattern. By reordering a ``[16][16]`` array to a # ``[16/4][16][4]`` array the access pattern of B will be sequential when -# grabing the corresponding value from the packed array. +# grabbing the corresponding value from the packed array. # # To accomplish this, we are going to have to start with a new default # schedule, taking into account the new packing of B. It's worth taking a @@ -889,7 +889,7 @@ def evaluate_operation(s, vars, target, name, optimization, log): # have from this introduction to TE, we can now begin to explore how TVM can # automate the schedule optimization process. # -# This tutorial provided a walkthrough of TVM Tensor Expresstion (TE) workflow +# This tutorial provided a walk-through of TVM Tensor Expression (TE) workflow # using a vector add and a matrix multiplication examples. 
The general workflow # is # diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py index e9a0801f34a81..11edc7ae9f3b9 100644 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ b/gallery/tutorial/tensor_ir_blitz_course.py @@ -25,7 +25,7 @@ - An implementation for transforming and optimizing programs on various hardware backends. -- An abstraction for automatic tensorized program optimization. +- An abstraction for automatic _tensorized_ program optimization. """ @@ -145,7 +145,7 @@ def main(a: T.handle, b: T.handle): # sequence of schedule primitives will help to improve the performance. And at last, we can lower # and build it into a runnable module. # -# Here we just demostrate a very simple tranformation. First we create schedule on the input `ir_module`. +# Here we just demonstrate a very simple transformation. First we create schedule on the input `ir_module`. sch = tvm.tir.Schedule(ir_module) print(type(sch)) @@ -155,7 +155,7 @@ def main(a: T.handle, b: T.handle): # Get block by its name block_b = sch.get_block("B") -# Get loops surronding the block +# Get loops surrounding the block (i,) = sch.get_loops(block_b) # Tile the loop nesting. i_0, i_1, i_2 = sch.split(i, factors=[2, 2, 2]) From a95a820cfaa0fa5d83e2f6a7c304c61e0de782c1 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 8 Jun 2022 13:41:02 +0800 Subject: [PATCH 065/181] [DNNL] Fix end of line in test_dnnl UT file (#11560) --- tests/python/contrib/test_dnnl.py | 2072 ++++++++++++++--------------- 1 file changed, 1036 insertions(+), 1036 deletions(-) diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index 19ac183d66dfe..babfad4a0c8c7 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -1,1036 +1,1036 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import pytest -import itertools -import numpy as np -import sys -import subprocess - -import tvm -from tvm import relay -from tvm.relay import transform -from tvm.relay.build_module import bind_params_by_name -from tvm.relay.testing.temp_op_attr import TempOpAttr -from tvm.relay.op.contrib import dnnl -import tvm.testing - - -has_dnnl_codegen = pytest.mark.skipif( - not tvm.get_global_func("relay.ext.dnnl", True), reason="DNNL codegen not available" -) - -run_module = tvm.testing.parameter( - pytest.param(False, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), - pytest.param(True, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), - ids=["compile", "run"], -) - -_bf16_supported = None - - -def bf16_supported(): - global _bf16_supported - if _bf16_supported is None: - _bf16_supported = False - if sys.platform.startswith("darwin"): - cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode() - for line in cpu_info.split("\n"): - if line.startswith("hw.optional.avx512f"): - _bf16_supported = bool(line.split(":", 1)[1]) - elif sys.platform.startswith("linux"): - _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() - return _bf16_supported - - -def partition_for_dnnl(mod, params=None, alter_layout=True): - """Partition the graph greedily offloading supported operators to DNNL. - - Parameters - ---------- - mod : Module - The module to run passes on. - params : Optional[Dict[str, NDArray]] - Constant input parameters. - Returns - ------- - mod : Module - Annotated and partitioned module. - """ - if params: - mod["main"] = bind_params_by_name(mod["main"], params) - - with TempOpAttr("nn.conv2d", "FTVMLegalize", dnnl.legalize_group_conv): - with TempOpAttr("nn.conv2d_transpose", "FTVMLegalize", dnnl.legalize_group_conv): - seq = tvm.transform.Sequential( - [ - transform.CanonicalizeOps(), - transform.InferType(), - transform.SimplifyInference(), - transform.FoldConstant(), - transform.FoldScaleAxis(), - # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu` - transform.SimplifyExpr(), - transform.FoldConstant(), - # alter group conv /conv_transpose layout to `GOIHW` / `GIOHW` - transform.Legalize(), - transform.FoldConstant(), - ] - ) - with tvm.transform.PassContext(opt_level=3): - mod = seq(mod) - if alter_layout: - with TempOpAttr("nn.conv1d", "FTVMAlterOpLayout", dnnl.alter_conv): - with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", dnnl.alter_conv): - with TempOpAttr("nn.conv3d", "FTVMAlterOpLayout", dnnl.alter_conv): - with TempOpAttr( - "nn.conv2d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose - ): - with TempOpAttr( - "nn.conv3d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose - ): - alter_layout_seq = tvm.transform.Sequential( - [ - transform.AlterOpLayout(), - transform.FoldConstant(), - ] - ) - with tvm.transform.PassContext(opt_level=3): - mod = alter_layout_seq(mod) - - byoc_seq = tvm.transform.Sequential( - [ - transform.MergeComposite(dnnl.pattern_table()), - transform.AnnotateTarget("dnnl"), - transform.MergeCompilerRegions(), - transform.PartitionGraph(), - ] - ) - with tvm.transform.PassContext(opt_level=3): - mod = byoc_seq(mod) - mod = dnnl.prune_dnnl_subgraphs(mod) - return mod - - -def vmobj_to_list(o): - if isinstance(o, tvm.nd.NDArray): - o_np = o.numpy() - if o_np.dtype == np.uint16: - o_np = np.left_shift(o_np.astype("uint32"), 16).view("= 1 - - dev = tvm.cpu() - result_dict = dict() - for mode in ["graph", "vm"]: - configs = [ - (False, False, False), - (True, False, False), - 
(True, True, False), - ] - if test_bf16 and bf16_supported(): - configs += [(True, False, True), (True, True, True)] - for use_dnnl, alter_layout, use_bf16 in configs: - result_key = ( - mode - + ("_dnnl" if use_dnnl else "") - + ("_layout" if alter_layout else "") - + ("_bf16" if use_bf16 else "_fp32") - ) - processed_mod = mod - if use_bf16: - processed_mod = relay.transform.ToMixedPrecision("bfloat16")(processed_mod) - if tvm.ir.structural_equal(processed_mod, mod): - print("can not convert to bfloat16, skipping...") - continue - if use_dnnl: - processed_mod = partition_for_dnnl(processed_mod, params, alter_layout) - check_dnnl_used(processed_mod) - - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=processed_mod, device=dev, target=target - ).evaluate() - if run_module: - if isinstance(input, dict): - result_dict[result_key] = func(**input, **params) - else: - result_dict[result_key] = func(input, **params) - - if run_module: - assert_result_dict_holds(result_dict) - - -def run_and_verify_func( - config, run_module, subgraph_num=None, target="llvm", dtype="float32", test_bf16=True -): - """Test a Relay func by compiling, running, and comparing TVM and DNNL outputs. - Parameters - ---------- - config : Tuple[relay.Function, Dict[str, NDArray], List[str]] - A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and - 3) A list of which vars should be considered params. - run_module: bool - If True, the built module will be run after being compiled. - """ - f, input_shapes, is_param = config - params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype) for x in is_param} - input_dict = { - k: np.random.uniform(-1, 1, v).astype(dtype) - for k, v in input_shapes.items() - if k not in is_param - } - run_and_verify( - f, - input_dict, - params, - subgraph_num=subgraph_num, - target=target, - run_module=run_module, - test_bf16=test_bf16, - ) - - -def get_conv1d( - x_shape=((1, 3, 224)), - k_shape=(16, 3, 3), - groups=1, - padding=(1, 1), - strides=(1), - dilation=(1), - channels=None, - activation=None, - dtype="float32", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) - out = relay.nn.conv1d( - x, - kernel, - kernel_size=k_shape[2:3], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - channels=k_shape[0], - ) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv1d_bias(x_shape=(1, 3, 224), k_shape=(10, 3, 3), activation=None, dtype="float32"): - conv, dic, param_lst = get_conv1d(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) - out = relay.nn.bias_add(conv, bias) - dic["bias"] = (k_shape[0],) - param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv1d_bias_bn_relu(x_shape=(1, 3, 224), k_shape=(10, 3, 3), dtype="float32"): - conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, dtype=dtype) - beta = 
relay.const(np.zeros(k_shape[0]).astype(dtype)) - gamma = relay.const(np.ones(k_shape[0]).astype(dtype)) - moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype)) - moving_var = relay.const(np.ones(k_shape[0]).astype(dtype)) - conv1d_bias_bn, _, _ = relay.nn.batch_norm( - conv1d_bias, - gamma=gamma, - beta=beta, - moving_mean=moving_mean, - moving_var=moving_var, - axis=1, - center=True, - scale=True, - epsilon=1e-5, - ) - return relay.nn.relu(conv1d_bias_bn), dic, param_lst - - -def get_conv2d( - x_shape=(1, 32, 8, 8), - k_shape=(16, 32, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), - dilation=(1, 1), - activation=None, - dtype="float32", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) - out = relay.nn.conv2d( - x, - kernel, - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - channels=k_shape[0], - ) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv2d_transpose( - x_shape=(1, 32, 8, 8), - k_shape=(32, 16, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), - activation=None, - dtype="float32", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) - out = relay.nn.conv2d_transpose( - x, - kernel, - channels=k_shape[1] * groups, - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - ) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv2d_weights_const( - x_shape=(1, 32, 8, 8), - k_shape=(16, 32, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), - dilation=(1, 1), - dtype="float32", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) - out = relay.nn.conv2d( - x, - kernel, - channels=k_shape[0], - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - ) - dic = {"x": x_shape} - param_lst = [] - return out, dic, param_lst - - -def get_conv2d_bias( - x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), activation=None, dtype="float32" -): - conv, dic, param_lst = get_conv2d_weights_const(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) - out = relay.nn.bias_add(conv, bias) - dic["bias"] = (k_shape[0],) - param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv2d_transpose_bias( - x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), activation=None, dtype="float32" -): - conv, dic, param_lst = get_conv2d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype) - out = relay.nn.bias_add(conv, bias) - dic["bias"] = (k_shape[1],) - 
param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv2d_bias_bn_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"): - conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype) - beta = relay.const(np.zeros(k_shape[0]).astype(dtype)) - gamma = relay.const(np.ones(k_shape[0]).astype(dtype)) - moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype)) - moving_var = relay.const(np.ones(k_shape[0]).astype(dtype)) - conv2d_bias_bn, _, _ = relay.nn.batch_norm( - conv2d_bias, - gamma=gamma, - beta=beta, - moving_mean=moving_mean, - moving_var=moving_var, - axis=1, - center=True, - scale=True, - epsilon=1e-5, - ) - return relay.nn.relu(conv2d_bias_bn), dic, param_lst - - -def get_conv2d_bias_sum_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"): - conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype) - sum_data = relay.const(np.random.randint(x_shape).astype(dtype)) - conv2d_bias_sum = relay.add(sum_data, conv2d_bias) - return relay.nn.relu(conv2d_bias_sum), dic, param_lst - - -def get_conv3d( - x_shape=(1, 32, 8, 8, 8), - k_shape=(16, 32, 3, 3, 3), - groups=1, - padding=(0, 0, 0), - strides=(1, 1, 1), - dilation=(1, 1, 1), - activation=None, - dtype="float32", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) - out = relay.nn.conv3d( - x, - kernel, - channels=k_shape[0], - kernel_size=k_shape[2:], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - ) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv3d_transpose( - x_shape=(1, 32, 8, 8, 8), - k_shape=(32, 16, 3, 3, 3), - groups=1, - padding=(0, 0, 0), - strides=(1, 1, 1), - output_padding=(0, 0, 0), - activation=None, - dtype="float32", - data_layout="NCDHW", - kernel_layout="OIDHW", -): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) - out = relay.nn.conv3d_transpose( - x, - kernel, - channels=k_shape[1], - kernel_size=k_shape[2:5], - groups=groups, - padding=padding, - strides=strides, - output_padding=output_padding, - data_layout=data_layout, - kernel_layout=kernel_layout, - ) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv3d_bias( - x_shape=(1, 32, 8, 8, 8), k_shape=(16, 32, 3, 3, 3), activation=None, dtype="float32" -): - conv, dic, param_lst = get_conv3d(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) - out = relay.nn.bias_add(conv, bias) - dic["bias"] = (k_shape[0],) - param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), 
dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_conv3d_transpose_bias( - x_shape=(1, 32, 8, 8, 8), k_shape=(32, 16, 3, 3, 3), activation=None, dtype="float32" -): - conv, dic, param_lst = get_conv3d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype) - out = relay.nn.bias_add(conv, bias) - dic["bias"] = (k_shape[1],) - param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst - - -def get_dense(x_shape=(1, 16), k_shape=(32, 16), activation=None, dtype="float32"): - x = relay.var("x", shape=(x_shape), dtype=dtype) - kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) - out = relay.nn.dense(x, kernel, units=k_shape[0]) - dic = {"x": x_shape, "kernel": k_shape} - param_lst = ["kernel"] - return out, dic, param_lst - - -def get_dense_bias(x_shape=(1, 16), k_shape=(32, 16), activation=None, dtype="float32"): - dense, dic, param_lst = get_dense(x_shape=x_shape, k_shape=k_shape, dtype=dtype) - bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) - out = relay.nn.bias_add(dense, bias) - dic["bias"] = (k_shape[0],) - param_lst += ["bias"] - return out, dic, param_lst - - -def test_dnnl_not_compatible(run_module, target="llvm", dtype="float32"): - xshape = (1, 32, 14, 14) - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.add(x, x) - z = relay.cast(relay.cast(y, "int32"), "float32") - out = relay.nn.relu(z) - f = relay.Function([x], out) - mod = tvm.IRModule() - mod["main"] = f - mod = partition_for_dnnl(mod) - for mode in ["graph", "vm"]: - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor(mode, mod=mod, device=tvm.cpu(0), target=target).evaluate() - if run_module: - results = func(x_data) - - -def test_multiple_outputs(run_module, dtype="float32"): - def get_graph(): - x = relay.var("x", shape=(1, 3), dtype=dtype) - y = relay.var("y", shape=(1, 3), dtype=dtype) - z = relay.add(x, y) - w = relay.add(z, y) - out = relay.Tuple((z, w)) - f = tvm.IRModule.from_expr(out) - return f, {"x": (1, 3), "y": (1, 3)}, [] - - run_and_verify_func(get_graph(), run_module=run_module, dtype=dtype) - - -def test_elementwise(run_module, dtype="float32"): - def get_graph(op, x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype=dtype) - out = op(x) - f = tvm.IRModule.from_expr(out) - return f, {"x": x_shape}, [] - - for op in [ - relay.abs, - relay.exp, - relay.log, - relay.sqrt, - relay.nn.relu, - relay.tanh, - relay.sigmoid, - ]: - run_and_verify_func(get_graph(op), run_module=run_module) - - -def test_clip(run_module, dtype="float32"): - def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype=dtype) - out = relay.clip(x, a_min=-0.2, a_max=0.4) - f = tvm.IRModule.from_expr(out) - return f, {"x": x_shape}, [] - - run_and_verify_func(get_graph(), run_module=run_module) - - -def test_leaky_relu(run_module, dtype="float32"): - def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype=dtype) - out = relay.nn.leaky_relu(x, alpha=0.1) - f = tvm.IRModule.from_expr(out) - return f, {"x": x_shape}, [] - - run_and_verify_func(get_graph(), run_module=run_module) - - -def 
test_softmax(run_module, dtype="float32"): - def get_graph(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype=dtype) - out = relay.nn.softmax(x, axis=axis) - f = tvm.IRModule.from_expr(out) - return f, {"x": x_shape}, [] - - run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module) - run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module) - - -def test_conv1d(run_module, dtype="float32"): - conv1d, dic, param_lst = get_conv1d(channels=16, dtype=dtype) - conv1d = tvm.IRModule.from_expr(conv1d) - config = conv1d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - x_shape = (1, 32, 224) - k_shape = (16, 32, 3) - conv1d_bias, dic, param_lst = get_conv1d(x_shape, k_shape, dtype=dtype) - conv1d_bias = tvm.IRModule.from_expr(conv1d_bias) - config = conv1d_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv1d_pattern(run_module, dtype="float32"): - x_shape = (1, 3, 224) - k_shape = (16, 3, 3) - activation_lst = [None, "relu", "tanh", "sigmoid"] - for a in activation_lst: - conv1d, dic, param_lst = get_conv1d(x_shape, k_shape, activation=a, dtype=dtype) - conv1d = tvm.IRModule.from_expr(conv1d) - config = conv1d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, activation=a, dtype=dtype) - conv1d_bias = tvm.IRModule.from_expr(conv1d_bias) - config = conv1d_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv2d(run_module, dtype="float32"): - x_shape = (1, 32, 8, 8) - for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 2, 3, 3), 16)]: - for padding in [(0, 0), (1, 1)]: - for strides in [(1, 1), (2, 2)]: - for dilation in [(1, 1), (2, 2)]: - conv2d, dic, param_lst = get_conv2d( - x_shape=x_shape, - k_shape=k_shape, - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - dtype=dtype, - ) - conv2d = tvm.IRModule.from_expr(conv2d) - config = conv2d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv2d_weights_const(run_module, dtype="float32"): - x_shape = (1, 32, 8, 8) - k_shape = (16, 32, 3, 3) - conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype) - conv2d = tvm.IRModule.from_expr(conv2d) - config = conv2d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - x_shape = (1, 3, 8, 8) - k_shape = (16, 3, 3, 3) - conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype) - conv2d = tvm.IRModule.from_expr(conv2d) - config = conv2d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv2d_pattern(run_module, dtype="float32"): - x_shape = (1, 32, 8, 8) - k_shape = (16, 32, 3, 3) - activation_lst = [None, "relu", "tanh", "sigmoid"] - for a in activation_lst: - conv2d, dic, param_lst = get_conv2d(x_shape, k_shape, activation=a, dtype=dtype) - conv2d = tvm.IRModule.from_expr(conv2d) - config = conv2d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, activation=a, dtype=dtype) - conv2d_bias = tvm.IRModule.from_expr(conv2d_bias) - config = conv2d_bias, dic, param_lst - 
run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype) - conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu) - config = conv2d_bias_bn_relu, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype) - conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu) - config = conv2d_bias_bn_relu, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv2d_transpose(run_module, dtype="float32"): - x_shape = (1, 32, 8, 8) - for k_shape, groups in [((32, 16, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 4, 3, 3), 16)]: - for padding in [(0, 0), (1, 1)]: - for strides in [(1, 1), (2, 2)]: - conv2d_transpose, dic, param_lst = get_conv2d_transpose( - x_shape=x_shape, - k_shape=k_shape, - groups=groups, - padding=padding, - strides=strides, - dtype=dtype, - ) - conv2d_transpose = tvm.IRModule.from_expr(conv2d_transpose) - config = conv2d_transpose, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv2d_transpose_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] - for a in activation_lst: - conv2d, dic, param_lst = get_conv2d_transpose(activation=a, dtype=dtype) - conv2d = tvm.IRModule.from_expr(conv2d) - config = conv2d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv2d_bias, dic, param_lst = get_conv2d_transpose_bias(activation=a, dtype=dtype) - conv2d_bias = tvm.IRModule.from_expr(conv2d_bias) - config = conv2d_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv3d(run_module, dtype="float32"): - conv3d, dic, param_lst = get_conv3d(dtype=dtype) - conv3d = tvm.IRModule.from_expr(conv3d) - config = conv3d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d, dic, param_lst = get_conv3d(padding=(0, 0, 0, 1, 1, 1), dtype=dtype) - conv3d = tvm.IRModule.from_expr(conv3d) - config = conv3d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d, dic, param_lst = get_conv3d( - x_shape=(1, 3, 8, 8, 8), k_shape=(16, 3, 3, 3, 3), dtype=dtype - ) - conv3d = tvm.IRModule.from_expr(conv3d) - config = conv3d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv3d_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] - for a in activation_lst: - conv3d, dic, param_lst = get_conv3d(activation=a, dtype=dtype) - conv3d = tvm.IRModule.from_expr(conv3d) - config = conv3d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d_bias, dic, param_lst = get_conv3d_bias(activation=a, dtype=dtype) - conv3d_bias = tvm.IRModule.from_expr(conv3d_bias) - config = conv3d_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv3d_transpose(run_module, dtype="float32"): - conv3d_transpose, dic, param_lst = get_conv3d_transpose(dtype=dtype) - conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose) - config = conv3d_transpose, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d_transpose, dic, param_lst = get_conv3d_transpose(strides=(2, 2, 2), dtype=dtype) - conv3d_transpose = 
tvm.IRModule.from_expr(conv3d_transpose) - config = conv3d_transpose, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d_transpose, dic, param_lst = get_conv3d_transpose( - strides=(2, 2, 2), output_padding=(1, 1, 1), dtype=dtype - ) - conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose) - config = conv3d_transpose, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_conv3d_transpose_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] - for a in activation_lst: - conv3d, dic, param_lst = get_conv3d_transpose(activation=a, dtype=dtype) - conv3d = tvm.IRModule.from_expr(conv3d) - config = conv3d, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - conv3d_bias, dic, param_lst = get_conv3d_transpose_bias(activation=a, dtype=dtype) - conv3d_bias = tvm.IRModule.from_expr(conv3d_bias) - config = conv3d_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_dense(run_module, dtype="float32"): - x_shape = (1, 16) - k_shape = (32, 16) - - dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype) - dense = tvm.IRModule.from_expr(dense) - config = dense, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - dense, dic, param_lst = get_dense(x_shape, k_shape=(1, 16), dtype=dtype) - dense = tvm.IRModule.from_expr(dense) - config = dense, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_dense_pattern(run_module, dtype="float32"): - x_shape = (1, 16) - k_shape = (32, 16) - - dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype) - dense = tvm.IRModule.from_expr(dense) - config = dense, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - dense_bias, dic, param_lst = get_dense_bias(x_shape, k_shape, dtype=dtype) - dense_bias = tvm.IRModule.from_expr(dense_bias) - config = dense_bias, dic, param_lst - run_and_verify_func(config, run_module=run_module, dtype=dtype) - - -def test_pool2d(run_module, dtype="float32"): - def get_graph( - op, - x_shape=(1, 3, 32, 32), - pool_size=(2, 2), - strides=(2, 2), - padding=(0, 0), - ceil_mode=False, - count_include_pad=None, - ): - x = relay.var("x", shape=(x_shape), dtype=dtype) - if count_include_pad is not None: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) - else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) - out = tvm.IRModule.from_expr(out) - return out, {"x": x_shape}, [] - - for pool_size in [(2, 2), (3, 3)]: - for strides in [(1, 1), (2, 2)]: - for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]: - for ceil_mode in [False]: - # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling" - if pool_size == (2, 2) and padding == (0, 0, 1, 1): - continue - for count_include_pad in [False, True]: - # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding" - if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)): - continue - run_and_verify_func( - get_graph( - relay.nn.avg_pool2d, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ), - run_module=run_module, - ) - run_and_verify_func( - get_graph( - 
relay.nn.max_pool2d, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ), - run_module=run_module, - ) - - -def test_pool3d(run_module, dtype="float32"): - def get_graph( - op, - x_shape=(1, 3, 8, 32, 32), - pool_size=(2, 2, 2), - strides=(2, 2, 2), - padding=(0, 0, 0), - ceil_mode=False, - count_include_pad=None, - dtype="float32", - ): - x = relay.var("x", shape=(x_shape), dtype=dtype) - if count_include_pad is not None: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) - else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) - out = tvm.IRModule.from_expr(out) - return out, {"x": x_shape}, [] - - run_and_verify_func(get_graph(relay.nn.avg_pool3d), run_module=run_module) - run_and_verify_func(get_graph(relay.nn.max_pool3d), run_module=run_module) - run_and_verify_func( - get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1)), run_module=run_module - ) - run_and_verify_func(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1)), run_module=run_module) - - -def test_prune_dnnl_subgraph(run_module): - """In this test, OP "add" should be offloaded from dnnl codegen.""" - - def get_graph(): - x1 = relay.var("x1", shape=(1, 32, 56, 56)) - x2 = relay.var("x2", shape=(1, 32, 56, 56)) - bias = relay.var("bias", shape=(32,)) - weight = relay.var("weight", shape=(32, 32, 3, 3)) - y = relay.nn.conv2d( - x1, - weight, - channels=32, - kernel_size=(3, 3), - padding=(1, 1), - ) - y = relay.nn.bias_add(y, bias) - y = relay.nn.relu(y) - y = relay.nn.global_max_pool2d(y) - y = relay.add(y, x2) - dic = { - "x1": (1, 32, 56, 56), - "x2": (1, 32, 56, 56), - "weight": (32, 32, 3, 3), - "bias": (32,), - } - param_lst = ["weight", "bias"] - out = tvm.IRModule.from_expr(y) - return out, dic, param_lst - - run_and_verify_func(get_graph(), subgraph_num=1, run_module=run_module, test_bf16=False) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest +import itertools +import numpy as np +import sys +import subprocess + +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.testing.temp_op_attr import TempOpAttr +from tvm.relay.op.contrib import dnnl +import tvm.testing + + +has_dnnl_codegen = pytest.mark.skipif( + not tvm.get_global_func("relay.ext.dnnl", True), reason="DNNL codegen not available" +) + +run_module = tvm.testing.parameter( + pytest.param(False, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), + pytest.param(True, marks=[has_dnnl_codegen, *tvm.testing.requires_llvm.marks()]), + ids=["compile", "run"], +) + +_bf16_supported = None + + +def bf16_supported(): + global _bf16_supported + if _bf16_supported is None: + _bf16_supported = False + if sys.platform.startswith("darwin"): + cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode() + for line in cpu_info.split("\n"): + if line.startswith("hw.optional.avx512f"): + _bf16_supported = bool(line.split(":", 1)[1]) + elif sys.platform.startswith("linux"): + _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() + return _bf16_supported + + +def partition_for_dnnl(mod, params=None, alter_layout=True): + """Partition the graph greedily offloading supported operators to DNNL. + + Parameters + ---------- + mod : Module + The module to run passes on. + params : Optional[Dict[str, NDArray]] + Constant input parameters. + Returns + ------- + mod : Module + Annotated and partitioned module. + """ + if params: + mod["main"] = bind_params_by_name(mod["main"], params) + + with TempOpAttr("nn.conv2d", "FTVMLegalize", dnnl.legalize_group_conv): + with TempOpAttr("nn.conv2d_transpose", "FTVMLegalize", dnnl.legalize_group_conv): + seq = tvm.transform.Sequential( + [ + transform.CanonicalizeOps(), + transform.InferType(), + transform.SimplifyInference(), + transform.FoldConstant(), + transform.FoldScaleAxis(), + # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu` + transform.SimplifyExpr(), + transform.FoldConstant(), + # alter group conv /conv_transpose layout to `GOIHW` / `GIOHW` + transform.Legalize(), + transform.FoldConstant(), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + if alter_layout: + with TempOpAttr("nn.conv1d", "FTVMAlterOpLayout", dnnl.alter_conv): + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", dnnl.alter_conv): + with TempOpAttr("nn.conv3d", "FTVMAlterOpLayout", dnnl.alter_conv): + with TempOpAttr( + "nn.conv2d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose + ): + with TempOpAttr( + "nn.conv3d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose + ): + alter_layout_seq = tvm.transform.Sequential( + [ + transform.AlterOpLayout(), + transform.FoldConstant(), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = alter_layout_seq(mod) + + byoc_seq = tvm.transform.Sequential( + [ + transform.MergeComposite(dnnl.pattern_table()), + transform.AnnotateTarget("dnnl"), + transform.MergeCompilerRegions(), + transform.PartitionGraph(), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = byoc_seq(mod) + mod = dnnl.prune_dnnl_subgraphs(mod) + return mod + + +def vmobj_to_list(o): + if isinstance(o, tvm.nd.NDArray): + o_np = o.numpy() + if o_np.dtype == np.uint16: + o_np = np.left_shift(o_np.astype("uint32"), 16).view("= 1 + + dev = tvm.cpu() + result_dict = dict() + for mode in ["graph", "vm"]: + configs = [ + (False, False, False), + (True, False, False), + 
(True, True, False), + ] + if test_bf16 and bf16_supported(): + configs += [(True, False, True), (True, True, True)] + for use_dnnl, alter_layout, use_bf16 in configs: + result_key = ( + mode + + ("_dnnl" if use_dnnl else "") + + ("_layout" if alter_layout else "") + + ("_bf16" if use_bf16 else "_fp32") + ) + processed_mod = mod + if use_bf16: + processed_mod = relay.transform.ToMixedPrecision("bfloat16")(processed_mod) + if tvm.ir.structural_equal(processed_mod, mod): + print("can not convert to bfloat16, skipping...") + continue + if use_dnnl: + processed_mod = partition_for_dnnl(processed_mod, params, alter_layout) + check_dnnl_used(processed_mod) + + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=processed_mod, device=dev, target=target + ).evaluate() + if run_module: + if isinstance(input, dict): + result_dict[result_key] = func(**input, **params) + else: + result_dict[result_key] = func(input, **params) + + if run_module: + assert_result_dict_holds(result_dict) + + +def run_and_verify_func( + config, run_module, subgraph_num=None, target="llvm", dtype="float32", test_bf16=True +): + """Test a Relay func by compiling, running, and comparing TVM and DNNL outputs. + Parameters + ---------- + config : Tuple[relay.Function, Dict[str, NDArray], List[str]] + A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and + 3) A list of which vars should be considered params. + run_module: bool + If True, the built module will be run after being compiled. + """ + f, input_shapes, is_param = config + params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype) for x in is_param} + input_dict = { + k: np.random.uniform(-1, 1, v).astype(dtype) + for k, v in input_shapes.items() + if k not in is_param + } + run_and_verify( + f, + input_dict, + params, + subgraph_num=subgraph_num, + target=target, + run_module=run_module, + test_bf16=test_bf16, + ) + + +def get_conv1d( + x_shape=((1, 3, 224)), + k_shape=(16, 3, 3), + groups=1, + padding=(1, 1), + strides=(1), + dilation=(1), + channels=None, + activation=None, + dtype="float32", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.conv1d( + x, + kernel, + kernel_size=k_shape[2:3], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + channels=k_shape[0], + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv1d_bias(x_shape=(1, 3, 224), k_shape=(10, 3, 3), activation=None, dtype="float32"): + conv, dic, param_lst = get_conv1d(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(conv, bias) + dic["bias"] = (k_shape[0],) + param_lst += ["bias"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv1d_bias_bn_relu(x_shape=(1, 3, 224), k_shape=(10, 3, 3), dtype="float32"): + conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, dtype=dtype) + beta = 
relay.const(np.zeros(k_shape[0]).astype(dtype)) + gamma = relay.const(np.ones(k_shape[0]).astype(dtype)) + moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype)) + moving_var = relay.const(np.ones(k_shape[0]).astype(dtype)) + conv1d_bias_bn, _, _ = relay.nn.batch_norm( + conv1d_bias, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + axis=1, + center=True, + scale=True, + epsilon=1e-5, + ) + return relay.nn.relu(conv1d_bias_bn), dic, param_lst + + +def get_conv2d( + x_shape=(1, 32, 8, 8), + k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + activation=None, + dtype="float32", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.conv2d( + x, + kernel, + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + channels=k_shape[0], + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv2d_transpose( + x_shape=(1, 32, 8, 8), + k_shape=(32, 16, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + activation=None, + dtype="float32", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.conv2d_transpose( + x, + kernel, + channels=k_shape[1] * groups, + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv2d_weights_const( + x_shape=(1, 32, 8, 8), + k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + dtype="float32", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) + out = relay.nn.conv2d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + dic = {"x": x_shape} + param_lst = [] + return out, dic, param_lst + + +def get_conv2d_bias( + x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), activation=None, dtype="float32" +): + conv, dic, param_lst = get_conv2d_weights_const(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(conv, bias) + dic["bias"] = (k_shape[0],) + param_lst += ["bias"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv2d_transpose_bias( + x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), activation=None, dtype="float32" +): + conv, dic, param_lst = get_conv2d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype) + out = relay.nn.bias_add(conv, bias) + dic["bias"] = (k_shape[1],) + 
param_lst += ["bias"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv2d_bias_bn_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"): + conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype) + beta = relay.const(np.zeros(k_shape[0]).astype(dtype)) + gamma = relay.const(np.ones(k_shape[0]).astype(dtype)) + moving_mean = relay.const(np.zeros(k_shape[0]).astype(dtype)) + moving_var = relay.const(np.ones(k_shape[0]).astype(dtype)) + conv2d_bias_bn, _, _ = relay.nn.batch_norm( + conv2d_bias, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + axis=1, + center=True, + scale=True, + epsilon=1e-5, + ) + return relay.nn.relu(conv2d_bias_bn), dic, param_lst + + +def get_conv2d_bias_sum_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"): + conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype) + sum_data = relay.const(np.random.randint(x_shape).astype(dtype)) + conv2d_bias_sum = relay.add(sum_data, conv2d_bias) + return relay.nn.relu(conv2d_bias_sum), dic, param_lst + + +def get_conv3d( + x_shape=(1, 32, 8, 8, 8), + k_shape=(16, 32, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + dilation=(1, 1, 1), + activation=None, + dtype="float32", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) + out = relay.nn.conv3d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv3d_transpose( + x_shape=(1, 32, 8, 8, 8), + k_shape=(32, 16, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + output_padding=(0, 0, 0), + activation=None, + dtype="float32", + data_layout="NCDHW", + kernel_layout="OIDHW", +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.const(np.random.randint(0, 1, k_shape).astype(dtype)) + out = relay.nn.conv3d_transpose( + x, + kernel, + channels=k_shape[1], + kernel_size=k_shape[2:5], + groups=groups, + padding=padding, + strides=strides, + output_padding=output_padding, + data_layout=data_layout, + kernel_layout=kernel_layout, + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv3d_bias( + x_shape=(1, 32, 8, 8, 8), k_shape=(16, 32, 3, 3, 3), activation=None, dtype="float32" +): + conv, dic, param_lst = get_conv3d(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(conv, bias) + dic["bias"] = (k_shape[0],) + param_lst += ["bias"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), 
dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_conv3d_transpose_bias( + x_shape=(1, 32, 8, 8, 8), k_shape=(32, 16, 3, 3, 3), activation=None, dtype="float32" +): + conv, dic, param_lst = get_conv3d_transpose(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[1],), dtype=dtype) + out = relay.nn.bias_add(conv, bias) + dic["bias"] = (k_shape[1],) + param_lst += ["bias"] + + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + else: + return out, dic, param_lst + + +def get_dense(x_shape=(1, 16), k_shape=(32, 16), activation=None, dtype="float32"): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.dense(x, kernel, units=k_shape[0]) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + return out, dic, param_lst + + +def get_dense_bias(x_shape=(1, 16), k_shape=(32, 16), activation=None, dtype="float32"): + dense, dic, param_lst = get_dense(x_shape=x_shape, k_shape=k_shape, dtype=dtype) + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(dense, bias) + dic["bias"] = (k_shape[0],) + param_lst += ["bias"] + return out, dic, param_lst + + +def test_dnnl_not_compatible(run_module, target="llvm", dtype="float32"): + xshape = (1, 32, 14, 14) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.add(x, x) + z = relay.cast(relay.cast(y, "int32"), "float32") + out = relay.nn.relu(z) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + mod = partition_for_dnnl(mod) + for mode in ["graph", "vm"]: + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor(mode, mod=mod, device=tvm.cpu(0), target=target).evaluate() + if run_module: + results = func(x_data) + + +def test_multiple_outputs(run_module, dtype="float32"): + def get_graph(): + x = relay.var("x", shape=(1, 3), dtype=dtype) + y = relay.var("y", shape=(1, 3), dtype=dtype) + z = relay.add(x, y) + w = relay.add(z, y) + out = relay.Tuple((z, w)) + f = tvm.IRModule.from_expr(out) + return f, {"x": (1, 3), "y": (1, 3)}, [] + + run_and_verify_func(get_graph(), run_module=run_module, dtype=dtype) + + +def test_elementwise(run_module, dtype="float32"): + def get_graph(op, x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype=dtype) + out = op(x) + f = tvm.IRModule.from_expr(out) + return f, {"x": x_shape}, [] + + for op in [ + relay.abs, + relay.exp, + relay.log, + relay.sqrt, + relay.nn.relu, + relay.tanh, + relay.sigmoid, + ]: + run_and_verify_func(get_graph(op), run_module=run_module) + + +def test_clip(run_module, dtype="float32"): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype=dtype) + out = relay.clip(x, a_min=-0.2, a_max=0.4) + f = tvm.IRModule.from_expr(out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(), run_module=run_module) + + +def test_leaky_relu(run_module, dtype="float32"): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype=dtype) + out = relay.nn.leaky_relu(x, alpha=0.1) + f = tvm.IRModule.from_expr(out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(), run_module=run_module) + + +def 
test_softmax(run_module, dtype="float32"): + def get_graph(x_shape, axis): + x = relay.var("x", shape=(x_shape), dtype=dtype) + out = relay.nn.softmax(x, axis=axis) + f = tvm.IRModule.from_expr(out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module) + run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module) + run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module) + run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module) + + +def test_conv1d(run_module, dtype="float32"): + conv1d, dic, param_lst = get_conv1d(channels=16, dtype=dtype) + conv1d = tvm.IRModule.from_expr(conv1d) + config = conv1d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + x_shape = (1, 32, 224) + k_shape = (16, 32, 3) + conv1d_bias, dic, param_lst = get_conv1d(x_shape, k_shape, dtype=dtype) + conv1d_bias = tvm.IRModule.from_expr(conv1d_bias) + config = conv1d_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv1d_pattern(run_module, dtype="float32"): + x_shape = (1, 3, 224) + k_shape = (16, 3, 3) + activation_lst = [None, "relu", "tanh", "sigmoid"] + for a in activation_lst: + conv1d, dic, param_lst = get_conv1d(x_shape, k_shape, activation=a, dtype=dtype) + conv1d = tvm.IRModule.from_expr(conv1d) + config = conv1d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv1d_bias, dic, param_lst = get_conv1d_bias(x_shape, k_shape, activation=a, dtype=dtype) + conv1d_bias = tvm.IRModule.from_expr(conv1d_bias) + config = conv1d_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv2d(run_module, dtype="float32"): + x_shape = (1, 32, 8, 8) + for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 2, 3, 3), 16)]: + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + for dilation in [(1, 1), (2, 2)]: + conv2d, dic, param_lst = get_conv2d( + x_shape=x_shape, + k_shape=k_shape, + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + dtype=dtype, + ) + conv2d = tvm.IRModule.from_expr(conv2d) + config = conv2d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv2d_weights_const(run_module, dtype="float32"): + x_shape = (1, 32, 8, 8) + k_shape = (16, 32, 3, 3) + conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype) + conv2d = tvm.IRModule.from_expr(conv2d) + config = conv2d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + x_shape = (1, 3, 8, 8) + k_shape = (16, 3, 3, 3) + conv2d, dic, param_lst = get_conv2d_weights_const(x_shape, k_shape, dtype=dtype) + conv2d = tvm.IRModule.from_expr(conv2d) + config = conv2d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv2d_pattern(run_module, dtype="float32"): + x_shape = (1, 32, 8, 8) + k_shape = (16, 32, 3, 3) + activation_lst = [None, "relu", "tanh", "sigmoid"] + for a in activation_lst: + conv2d, dic, param_lst = get_conv2d(x_shape, k_shape, activation=a, dtype=dtype) + conv2d = tvm.IRModule.from_expr(conv2d) + config = conv2d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, activation=a, dtype=dtype) + conv2d_bias = tvm.IRModule.from_expr(conv2d_bias) + config = conv2d_bias, dic, param_lst + 
run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype) + conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu) + config = conv2d_bias_bn_relu, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv2d_bias_bn_relu, dic, param_lst = get_conv2d_bias_bn_relu(x_shape, k_shape, dtype=dtype) + conv2d_bias_bn_relu = tvm.IRModule.from_expr(conv2d_bias_bn_relu) + config = conv2d_bias_bn_relu, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv2d_transpose(run_module, dtype="float32"): + x_shape = (1, 32, 8, 8) + for k_shape, groups in [((32, 16, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 4, 3, 3), 16)]: + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + conv2d_transpose, dic, param_lst = get_conv2d_transpose( + x_shape=x_shape, + k_shape=k_shape, + groups=groups, + padding=padding, + strides=strides, + dtype=dtype, + ) + conv2d_transpose = tvm.IRModule.from_expr(conv2d_transpose) + config = conv2d_transpose, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv2d_transpose_pattern(run_module, dtype="float32"): + activation_lst = [None, "relu", "tanh", "sigmoid"] + for a in activation_lst: + conv2d, dic, param_lst = get_conv2d_transpose(activation=a, dtype=dtype) + conv2d = tvm.IRModule.from_expr(conv2d) + config = conv2d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv2d_bias, dic, param_lst = get_conv2d_transpose_bias(activation=a, dtype=dtype) + conv2d_bias = tvm.IRModule.from_expr(conv2d_bias) + config = conv2d_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv3d(run_module, dtype="float32"): + conv3d, dic, param_lst = get_conv3d(dtype=dtype) + conv3d = tvm.IRModule.from_expr(conv3d) + config = conv3d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d, dic, param_lst = get_conv3d(padding=(0, 0, 0, 1, 1, 1), dtype=dtype) + conv3d = tvm.IRModule.from_expr(conv3d) + config = conv3d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d, dic, param_lst = get_conv3d( + x_shape=(1, 3, 8, 8, 8), k_shape=(16, 3, 3, 3, 3), dtype=dtype + ) + conv3d = tvm.IRModule.from_expr(conv3d) + config = conv3d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv3d_pattern(run_module, dtype="float32"): + activation_lst = [None, "relu", "tanh", "sigmoid"] + for a in activation_lst: + conv3d, dic, param_lst = get_conv3d(activation=a, dtype=dtype) + conv3d = tvm.IRModule.from_expr(conv3d) + config = conv3d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d_bias, dic, param_lst = get_conv3d_bias(activation=a, dtype=dtype) + conv3d_bias = tvm.IRModule.from_expr(conv3d_bias) + config = conv3d_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv3d_transpose(run_module, dtype="float32"): + conv3d_transpose, dic, param_lst = get_conv3d_transpose(dtype=dtype) + conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose) + config = conv3d_transpose, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d_transpose, dic, param_lst = get_conv3d_transpose(strides=(2, 2, 2), dtype=dtype) + conv3d_transpose = 
tvm.IRModule.from_expr(conv3d_transpose) + config = conv3d_transpose, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d_transpose, dic, param_lst = get_conv3d_transpose( + strides=(2, 2, 2), output_padding=(1, 1, 1), dtype=dtype + ) + conv3d_transpose = tvm.IRModule.from_expr(conv3d_transpose) + config = conv3d_transpose, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_conv3d_transpose_pattern(run_module, dtype="float32"): + activation_lst = [None, "relu", "tanh", "sigmoid"] + for a in activation_lst: + conv3d, dic, param_lst = get_conv3d_transpose(activation=a, dtype=dtype) + conv3d = tvm.IRModule.from_expr(conv3d) + config = conv3d, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + conv3d_bias, dic, param_lst = get_conv3d_transpose_bias(activation=a, dtype=dtype) + conv3d_bias = tvm.IRModule.from_expr(conv3d_bias) + config = conv3d_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_dense(run_module, dtype="float32"): + x_shape = (1, 16) + k_shape = (32, 16) + + dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype) + dense = tvm.IRModule.from_expr(dense) + config = dense, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + dense, dic, param_lst = get_dense(x_shape, k_shape=(1, 16), dtype=dtype) + dense = tvm.IRModule.from_expr(dense) + config = dense, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_dense_pattern(run_module, dtype="float32"): + x_shape = (1, 16) + k_shape = (32, 16) + + dense, dic, param_lst = get_dense(x_shape, k_shape, dtype=dtype) + dense = tvm.IRModule.from_expr(dense) + config = dense, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + dense_bias, dic, param_lst = get_dense_bias(x_shape, k_shape, dtype=dtype) + dense_bias = tvm.IRModule.from_expr(dense_bias) + config = dense_bias, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + +def test_pool2d(run_module, dtype="float32"): + def get_graph( + op, + x_shape=(1, 3, 32, 32), + pool_size=(2, 2), + strides=(2, 2), + padding=(0, 0), + ceil_mode=False, + count_include_pad=None, + ): + x = relay.var("x", shape=(x_shape), dtype=dtype) + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + out = tvm.IRModule.from_expr(out) + return out, {"x": x_shape}, [] + + for pool_size in [(2, 2), (3, 3)]: + for strides in [(1, 1), (2, 2)]: + for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]: + for ceil_mode in [False]: + # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling" + if pool_size == (2, 2) and padding == (0, 0, 1, 1): + continue + for count_include_pad in [False, True]: + # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding" + if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)): + continue + run_and_verify_func( + get_graph( + relay.nn.avg_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ), + run_module=run_module, + ) + run_and_verify_func( + get_graph( + 
relay.nn.max_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ), + run_module=run_module, + ) + + +def test_pool3d(run_module, dtype="float32"): + def get_graph( + op, + x_shape=(1, 3, 8, 32, 32), + pool_size=(2, 2, 2), + strides=(2, 2, 2), + padding=(0, 0, 0), + ceil_mode=False, + count_include_pad=None, + dtype="float32", + ): + x = relay.var("x", shape=(x_shape), dtype=dtype) + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + out = tvm.IRModule.from_expr(out) + return out, {"x": x_shape}, [] + + run_and_verify_func(get_graph(relay.nn.avg_pool3d), run_module=run_module) + run_and_verify_func(get_graph(relay.nn.max_pool3d), run_module=run_module) + run_and_verify_func( + get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1)), run_module=run_module + ) + run_and_verify_func(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1)), run_module=run_module) + + +def test_prune_dnnl_subgraph(run_module): + """In this test, OP "add" should be offloaded from dnnl codegen.""" + + def get_graph(): + x1 = relay.var("x1", shape=(1, 32, 56, 56)) + x2 = relay.var("x2", shape=(1, 32, 56, 56)) + bias = relay.var("bias", shape=(32,)) + weight = relay.var("weight", shape=(32, 32, 3, 3)) + y = relay.nn.conv2d( + x1, + weight, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + ) + y = relay.nn.bias_add(y, bias) + y = relay.nn.relu(y) + y = relay.nn.global_max_pool2d(y) + y = relay.add(y, x2) + dic = { + "x1": (1, 32, 56, 56), + "x2": (1, 32, 56, 56), + "weight": (32, 32, 3, 3), + "bias": (32,), + } + param_lst = ["weight", "bias"] + out = tvm.IRModule.from_expr(y) + return out, dic, param_lst + + run_and_verify_func(get_graph(), subgraph_num=1, run_module=run_module, test_bf16=False) + + +if __name__ == "__main__": + tvm.testing.main() From 8ba43003a00c2ca92017df2ec24ccaef6ddcf636 Mon Sep 17 00:00:00 2001 From: Jian Sheng <84881952+jsheng-jian@users.noreply.github.com> Date: Tue, 7 Jun 2022 23:09:49 -0700 Subject: [PATCH 066/181] minor fix after loading trt engine from disk (#11614) --- src/runtime/contrib/tensorrt/tensorrt_runtime.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 554515c456797..18ffdbbbba858 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -376,7 +376,8 @@ class TensorRTRuntime : public JSONRuntimeBase { helper.DeclareField("batch_size", &batch_size); helper.ReadAllFields(&reader); trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context; - LOG(INFO) << "finished saving engine and context ... "; + max_batch_size_ = batch_size; + LOG(INFO) << "finished loading engine and context ... "; return true; } From 6dc0c624cdd8fb9d7fdd2194a755b0dffbe2de93 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Wed, 8 Jun 2022 03:00:35 -0700 Subject: [PATCH 067/181] [Relay] Restore dominator check (#11616) It is ok to match a sub-graph which has dataflow outside of the sub-graph, provided all such flows eventually come into the sub-graph. 
--- src/relay/ir/dataflow_matcher.cc | 28 +++++++---- src/relay/ir/dataflow_matcher_impl.h | 1 + tests/python/contrib/test_cutlass.py | 2 +- tests/python/relay/test_dataflow_pattern.py | 52 ++++++++++++++++++++- 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index df896cb690eb2..b2776a41c50ce 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -609,6 +609,8 @@ void PatternGrouper::VisitExprs() { } void PatternGrouper::CreateGroup(const Expr& expr) { + VLOG(1) << "Creating group for:" << std::endl << PrettyPrint(expr); + int var_number = 0; auto node_map = matcher_->GetMemo(); @@ -696,6 +698,7 @@ void PatternGrouper::CreateGroup(const Expr& expr) { auto body = extractor.Mutate(expr); group.function = Function(params, body, NullValue(), Array()); + VLOG(1) << "Candidate extracted function:" << std::endl << PrettyPrint(group.function); group.name = extractor.GetName(); // Check to make sure we aren't overlapping with another group or creating an invalid fusion // The MatchExtractor will create a new graph by replacing nodes that match the inputs of the @@ -708,6 +711,10 @@ void PatternGrouper::CreateGroup(const Expr& expr) { // Similiarly, if interior nodes in a group are used outside of the group fusing to a single // output would create an invalid graph tranformation, so we block the creation of such groups. auto memo = extractor.GetMemo(); + for (auto kv : memo) { + VLOG(1) << "matched index " << matcher_->expr_to_node(kv.first)->index_; + } + for (auto kv : memo) { // Check to ensure that this node isn't an input or a global if (inputs.count(kv.first) == 0 && kv.first.as() == nullptr && @@ -720,16 +727,19 @@ void PatternGrouper::CreateGroup(const Expr& expr) { // if the node isn't the output of the group auto node = matcher_->expr_to_node(kv.first); for (auto* output : node->outputs_) { - // and the node is used by nodes outside of the group if (memo.count(output->ref()) == 0) { - // TODO(mbs): This condition used to also include the following test, which since - // the dominators relation is used back-to-front was always vacuously true. So the - // code is just rejecting the match if a strictly internal node happened to connect - // to an outside node. - ICHECK(!matcher_->expr_to_node(expr)->Dominates(output)); - // Exit because nodes in this pattern's body are used outside the pattern, fusing it - // would be invalid - return; + // A node inside the matched group contributes an output to nodes outside of the matched + // group... + auto root = matcher_->expr_to_node(expr); + if (!root->Dominates(output)) { + // ...and the outside dataflow does not come back to the root of the matched group. + // So reject the match since it would create a cycle. + VLOG(1) << "Rejecting group since would create a cycle with output " << output->index_ + << " for root " << root->index_ << " in graph:" << std::endl + << matcher_->expr_graph().ToString(); + return; + } + // else: We'll allow the output to be included in the matched group. 
} } } diff --git a/src/relay/ir/dataflow_matcher_impl.h b/src/relay/ir/dataflow_matcher_impl.h index f04190f72e40b..a174d8e34eb7f 100644 --- a/src/relay/ir/dataflow_matcher_impl.h +++ b/src/relay/ir/dataflow_matcher_impl.h @@ -55,6 +55,7 @@ class DFPatternMatcher : public DFPatternFunctor, ObjectPtrHash, ObjectPtrEqual>& memo() const { return memo_; } + const IndexedGraph& expr_graph() const { return *expr_graph_; } protected: bool VisitDFPattern(const DFPattern& pattern, const Expr& expr) override; diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py index c105979402211..8e5238b17399c 100644 --- a/tests/python/contrib/test_cutlass.py +++ b/tests/python/contrib/test_cutlass.py @@ -941,4 +941,4 @@ def test_conv2d_bwd(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py index f0474c9112736..ba066e9a438f9 100644 --- a/tests/python/relay/test_dataflow_pattern.py +++ b/tests/python/relay/test_dataflow_pattern.py @@ -1458,7 +1458,6 @@ def concat(*args): def test_partition_fuzzy_function_args(): - func_pattern = FunctionPattern(None, wildcard() + wildcard())(None) + wildcard() x = relay.var("x") y = relay.var("y") @@ -1790,5 +1789,56 @@ def callback(self, pre, post, node_map): assert tvm.ir.structural_equal(out, expected) +def test_matched_outside_but_dominated(): + """In this example the pattern matches the nn.conv2d/add/multiply flow. Even though the + add output is consumed by the sigmoid, the sigmoid itself is dominated by the multiply. + So partitioning can proceed, all be it with a duplication of the add.""" + in_mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(16, 16, 32, 32), float16], %weight: Tensor[(32, 16, 3, 3), float16], %bias: Tensor[(32), float32]) -> Tensor[(16, 32, 32, 32), float32] { + %0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC"); + %1 = layout_transform(%weight, src_layout="OIHW", dst_layout="OHWI"); + %2 = expand_dims(%bias, axis=1, num_newaxis=2); + %3 = expand_dims(%2, axis=0); + %4 = nn.conv2d(%0, %1, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32"); + %5 = layout_transform(%3, src_layout="NCHW", dst_layout="NHWC"); + %6 = add(%4, %5); + %7 = sigmoid(%6); + %8 = multiply(%6, %7); + layout_transform(%8, src_layout="NHWC", dst_layout="NCHW") + } + """ + ) + expected_mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(16, 16, 32, 32), float16], %weight: Tensor[(32, 16, 3, 3), float16], %bias: Tensor[(32), float32]) -> Tensor[(16, 32, 32, 32), float32] { + %2 = expand_dims(%bias, axis=1, num_newaxis=2); + %3 = expand_dims(%2, axis=0); + %4 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC"); + %5 = layout_transform(%weight, src_layout="OIHW", dst_layout="OHWI"); + %6 = nn.conv2d(%4, %5, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32"); + %7 = layout_transform(%3, src_layout="NCHW", dst_layout="NHWC"); + %8 = add(%6, %7); + %9 = sigmoid(%8); + %10 = fn (%FunctionVar_0_0, %FunctionVar_0_1, %FunctionVar_0_2, %FunctionVar_0_3, PartitionedFromPattern="nn.conv2d_add_multiply_") { + %0 = nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="OHWI", out_dtype="float32"); + %1 = add(%0, %FunctionVar_0_2); + multiply(%1, 
%FunctionVar_0_3) + }; + %11 = %10(%4, %5, %7, %9); + layout_transform(%11, src_layout="NHWC", dst_layout="NCHW") + } + """ + ) + pattern = is_op("multiply")( + is_op("add")(is_op("nn.conv2d")(wildcard(), wildcard()), wildcard()), wildcard() + ) + actual_mod = tvm.IRModule.from_expr(pattern.partition(in_mod["main"])) + actual_mod = relay.transform.InferType()(actual_mod) + tvm.ir.assert_structural_equal(actual_mod, expected_mod) + + if __name__ == "__main__": tvm.testing.main() From b00b1229c881fa6f2f9fe9e44819c9dc3de09f74 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 8 Jun 2022 07:24:36 -0500 Subject: [PATCH 068/181] [Hexagon] Make local symbols visible to loaded modules in RPC server (#11611) The simulator library `libhexagon_rpc_sim.so` contains TVM runtime built into it, but since it's loaded as a "local" library these symbols are not visible to shared libraries loaded by subsequent dlopens. (Same applies to symbols from the C++ runtime.) To make these symbols visible, dlopen the defining libraries as "global". (Re-dlopeninig an already loaded library is a well-defined operation.) --- src/runtime/hexagon/rpc/simulator/rpc_server.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc index 29373be542f3f..9b4ce3f11443e 100644 --- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc +++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -288,7 +289,16 @@ int DISPATCH_FUNCTION_NAME(void* serverp) { return 0; } -int main() { +int main(int argc, char* argv[]) { + // Load C++RT and ourselves as "global" to make all the symbols defined + // there be visible to any subsequent libraries loaded via dlopen. + void* cxx_abi = dlopen("libc++abi.so", RTLD_GLOBAL); + ICHECK(cxx_abi != nullptr); + void* cxx = dlopen("libc++.so", RTLD_GLOBAL); + ICHECK(cxx != nullptr); + void* self = dlopen(argv[0], RTLD_GLOBAL); + ICHECK(self != nullptr); + const auto* api = tvm::runtime::Registry::Get("device_api.hexagon"); ICHECK(api != nullptr); tvm::runtime::Registry::Register("device_api.cpu", true).set_body(*api); @@ -308,6 +318,9 @@ int main() { // nothing } + dlclose(self); + dlclose(cxx); + dlclose(cxx_abi); return 0; } From e19cf20054a9fe5049c71b02753c155110b0a6ba Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Wed, 8 Jun 2022 15:21:29 +0200 Subject: [PATCH 069/181] TVMC: Allow to overwrite TVM_CONFIGS_JSON_DIR via environment variables (#11623) If a non-default location for the build directory is used, e.g. set via TVM_LIBRARY_PATH we need to provide the user a way to overwrite CONFIGS_JSON_DIR as well. --- python/tvm/driver/tvmc/config_options.py | 9 +++++++ .../driver/tvmc/test_parse_config_file.py | 27 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/config_options.py b/python/tvm/driver/tvmc/config_options.py index ae5616e7245af..c384c89b1a2b6 100644 --- a/python/tvm/driver/tvmc/config_options.py +++ b/python/tvm/driver/tvmc/config_options.py @@ -43,6 +43,15 @@ def get_configs_json_dir() -> str: """ global CONFIGS_JSON_DIR if CONFIGS_JSON_DIR is None: + + # If a non-default location for the build directory is used, e.g. set via TVM_LIBRARY_PATH + # we need to provide the user a way to overwrite CONFIGS_JSON_DIR as well. 
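        # (Editorial sketch, not part of the original hunk: with this change a
        #  user who builds TVM in a non-default location can run, for example,
        #      TVM_LIBRARY_PATH=$HOME/tvm/build \
        #      TVM_CONFIGS_JSON_DIR=$HOME/tvm/configs/host \
        #      tvmc compile --target "llvm" model.onnx
        #  where the two paths are placeholders; as the code below shows, the
        #  override is only honored when the directory actually exists.)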
+ if os.environ.get("TVM_CONFIGS_JSON_DIR", None): + user_config_dir = os.environ["TVM_CONFIGS_JSON_DIR"] + if os.path.isdir(user_config_dir): + CONFIGS_JSON_DIR = user_config_dir + return CONFIGS_JSON_DIR + candidate_paths = [] candidate_paths.extend(libinfo.find_lib_path()) # When running from source, the configs directory will be located one directory above the diff --git a/tests/python/driver/tvmc/test_parse_config_file.py b/tests/python/driver/tvmc/test_parse_config_file.py index a80daba3a47ab..6aec2cd453a3e 100644 --- a/tests/python/driver/tvmc/test_parse_config_file.py +++ b/tests/python/driver/tvmc/test_parse_config_file.py @@ -20,7 +20,7 @@ import tvm from tvm.driver.tvmc.main import _main -from tvm.driver.tvmc.config_options import convert_config_json_to_cli +from tvm.driver.tvmc.config_options import convert_config_json_to_cli, get_configs_json_dir def test_parse_json_config_file_one_target(): @@ -153,3 +153,28 @@ def test_tvmc_cl_compile_run_config_file(tflite_mobilenet_v1_1_quant, tmpdir_fac exit_code = _main(tvmc_args) on_error = "Trying to run a MLF archive must fail because it's only supported on micro targets." assert exit_code != 0, on_error + + +def test_tvmc_get_configs_json_dir(tmpdir_factory, monkeypatch): + # Reset global state + monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None) + + # Get default directory for reference + default_dir = get_configs_json_dir() + + # Set custom dir which does not exist -> ignore + monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None) + monkeypatch.setenv("TVM_CONFIGS_JSON_DIR", "not_a_directory") + result = get_configs_json_dir() + assert_msg = "Non-existant directory passed via TVM_CONFIGS_JSON_DIR should be ignored." + assert result == default_dir, assert_msg + + # Set custom dir which does exist + monkeypatch.setattr(tvm.driver.tvmc.config_options, "CONFIGS_JSON_DIR", None) + configs_dir = tmpdir_factory.mktemp("configs") + monkeypatch.setenv("TVM_CONFIGS_JSON_DIR", str(configs_dir)) + result = get_configs_json_dir() + assert_msg = ( + "Custom value passed via TVM_CONFIGS_JSON_DIR should be used instead of default one." + ) + assert result != default_dir and result is not None, assert_msg From 96a513cd97be4b42acb51d1c9b73288820e90185 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 8 Jun 2022 11:39:42 -0700 Subject: [PATCH 070/181] Patch replay trace. (#11621) --- include/tvm/meta_schedule/search_strategy.h | 4 +++- .../search_strategy/replay_trace.py | 8 +++++++- .../search_strategy/replay_trace.cc | 18 +++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h index baae22f0d98ec..5e249850f5d5b 100644 --- a/include/tvm/meta_schedule/search_strategy.h +++ b/include/tvm/meta_schedule/search_strategy.h @@ -211,8 +211,10 @@ class SearchStrategy : public runtime::ObjectRef { * \brief Constructor of replay trace search strategy. * \param num_trials_per_iter The number of trials per iteration, i.e., the batch size. * \param max_trials_per_task The total number of trials for trace replaying. + * \param max_fail_count The max number of failures during trace replaying. */ - TVM_DLL static SearchStrategy ReplayTrace(int num_trials_per_iter, int max_trials_per_task); + TVM_DLL static SearchStrategy ReplayTrace(int num_trials_per_iter, int max_trials_per_task, + int max_fail_count); /*! * \brief Constructor of replay func search strategy. 
diff --git a/python/tvm/meta_schedule/search_strategy/replay_trace.py b/python/tvm/meta_schedule/search_strategy/replay_trace.py index 70461d65f7765..36dbb8734e577 100644 --- a/python/tvm/meta_schedule/search_strategy/replay_trace.py +++ b/python/tvm/meta_schedule/search_strategy/replay_trace.py @@ -33,15 +33,21 @@ class ReplayTrace(SearchStrategy): Number of trials per iteration. max_trials_per_task : int Total number of trials for one task + max_fail_count : int + Max number of failures during trace replaying. """ num_trials_per_iter: int max_trials_per_task: int + max_fail_count: int - def __init__(self, num_trials_per_iter: int, max_trials_per_task: int): + def __init__( + self, num_trials_per_iter: int, max_trials_per_task: int, max_fail_count: int = 100 + ): """Constructor""" self.__init_handle_by_constructor__( _ffi_api.SearchStrategyReplayTrace, # type: ignore # pylint: disable=no-member num_trials_per_iter, max_trials_per_task, + max_fail_count, ) diff --git a/src/meta_schedule/search_strategy/replay_trace.cc b/src/meta_schedule/search_strategy/replay_trace.cc index 13f32a744e3a0..355f71455d912 100644 --- a/src/meta_schedule/search_strategy/replay_trace.cc +++ b/src/meta_schedule/search_strategy/replay_trace.cc @@ -60,6 +60,8 @@ class ReplayTraceNode : public SearchStrategyNode { int num_trials_per_iter; /*! \brief The number of total trials. */ int max_trials_per_task; + /*! \brief The max number of failures during trace replaying. */ + int max_fail_count; /*! \brief The tuning context of the search strategy. */ const TuneContextNode* context_{nullptr}; @@ -71,6 +73,7 @@ class ReplayTraceNode : public SearchStrategyNode { void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("num_trials_per_iter", &num_trials_per_iter); v->Visit("max_trials_per_task", &max_trials_per_task); + v->Visit("max_fail_count", &max_fail_count); // `context_` is not visited. 
// `rand_state_` is not visited // `state_` is not visited @@ -136,7 +139,8 @@ inline Optional> ReplayTraceNode::State::GenerateMeasure int task_id) -> void { TRandState& rand_state = per_thread_rand_state[thread_id]; IRModule mod = this->per_thread_mod_[thread_id]; - for (;;) { + + for (int fail_count = 0; fail_count < self->max_fail_count; fail_count++) { int design_space_index = tir::SampleInt(&rand_state, 0, design_spaces.size()); tir::Trace trace = design_spaces[design_space_index]; tir::Trace new_trace = tir::Trace(trace->insts, {}); @@ -147,7 +151,13 @@ inline Optional> ReplayTraceNode::State::GenerateMeasure } }; support::parallel_for_dynamic(0, ed - st, ctx->num_threads, f_worker); - return per_task_result; + Array filtered; + filtered.reserve(ed - st); + for (MeasureCandidate result : per_task_result) + if (result.defined()) { + filtered.push_back(result); + } + return filtered; } inline void ReplayTraceNode::State::NotifyRunnerResults(const Array& results) { @@ -155,10 +165,12 @@ inline void ReplayTraceNode::State::NotifyRunnerResults(const Arraynum_trials_per_iter; } -SearchStrategy SearchStrategy::ReplayTrace(int num_trials_per_iter, int max_trials_per_task) { +SearchStrategy SearchStrategy::ReplayTrace(int num_trials_per_iter, int max_trials_per_task, + int max_fail_count) { ObjectPtr n = make_object(); n->num_trials_per_iter = num_trials_per_iter; n->max_trials_per_task = max_trials_per_task; + n->max_fail_count = max_fail_count; return SearchStrategy(n); } From 9817338508f3f8cd5a444133b4de99ce577c031b Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 9 Jun 2022 03:12:36 +0800 Subject: [PATCH 071/181] [BYOC][DNNL] Enable layer normalization in DNNL byoc. (#11508) * Enable layer normalization in DNNL byoc. * Added unittest for layer norm and make code compatible after introducing TensorRequisite(PR-11345) * Fix lint issue * Fix clang format issue --- python/tvm/relay/op/contrib/dnnl.py | 70 ++++++++++++++++++- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 47 +++++++++++++ tests/python/contrib/test_dnnl.py | 21 ++++++ 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 2e975cf49c885..c87a7162b0707 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -41,7 +41,7 @@ from tvm.relay.expr_functor import ExprMutator, ExprVisitor from ... import _ffi_api -from ...dataflow_pattern import wildcard, is_op +from ...dataflow_pattern import wildcard, is_op, is_expr, rewrite, DFPatternCallback from .register import register_pattern_table logger = logging.getLogger("DNNL") @@ -92,6 +92,7 @@ def _func_wrapper(expr): _register_external_op_helper("nn.softmax") _register_external_op_helper("add") _register_external_op_helper("multiply") +_register_external_op_helper("nn.layer_norm") def make_conv_pattern(conv_name, with_bias=True, with_eltwise=None): @@ -455,6 +456,7 @@ def visit_call(self, call): "nn.conv3d", "nn.conv3d_transpose", "nn.dense", + "nn.layer_norm", ] ) if isinstance(call.op, tvm.tir.op.Op): @@ -526,3 +528,69 @@ def visit_call(self, call): new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"]) new_mod = transform.RemoveUnusedFunctions()(new_mod) return new_mod + + +class LayerNormRewrite(DFPatternCallback): + """ + A callback to rewrite the following operators into a single layer normalization operator. 
+ + Pattern #1: + 1 %4 = mean(%3, axis=[-1], keepdims=True) /* ty=Tensor[(1, 3136, 1), float32] */; + 2 %5 = subtract(%3, %4) /* ty=Tensor[(1, 3136, 64), float32] */; + 3 %6 = cast(%5, dtype="float32") /* ty=Tensor[(1, 3136, 64), float32] */; + 4 %7 = power(%6, 2f /* ty=float32 */) /* ty=Tensor[(1, 3136, 64), float32] */; + 5 %8 = mean(%7, axis=[-1], keepdims=True) /* ty=Tensor[(1, 3136, 1), float32] */; + 6 %9 = add(%8, 1e-05f /* ty=float32 */) /* ty=Tensor[(1, 3136, 1), float32] */; + 7 %10 = sqrt(%9) /* ty=Tensor[(1, 3136, 1), float32] */; + 8 %11 = divide(%5, %10) /* ty=Tensor[(1, 3136, 64), float32] */; + 9 %12 = multiply(%11, meta[relay.Constant][2] /* ty=Tensor[(64), float32] */) + /* ty=Tensor[(1, 3136, 64), float32] */; + 10 %13 = add(%12, meta[relay.Constant][3] /* ty=Tensor[(64), float32] */) + /* ty=Tensor[(1, 3136, 64), float32] */; + + Pattern #2: + 1 %0 = mean(%input, axis=[-1], keepdims=True); + 2 %1 = variance(%input, %0, axis=[-1], keepdims=True); + 3 %2 = add(%1, 1e-05f /* ty=float32 */) /* ty=Tensor[(1, 49, 1), float32] */; + 4 %3 = subtract(%input, %0); + 5 %4 = sqrt(%2) /* ty=Tensor[(1, 49, 1), float32] */; + 6 %5 = divide(%3, %4); + 7 %6 = multiply(%5, meta[relay.Constant][0] /* ty=Tensor[(64), float32] */) + /* ty=Tensor[(1, 49, 64), float32] */; + 8 %7 = add(%6, meta[relay.Constant][1] /* ty=Tensor[(64), float32] */) + /* ty=Tensor[(1, 49, 64), float32] */ + + """ + + def __init__(self): + super(LayerNormRewrite, self).__init__() + self.data = wildcard() + self.gamma = wildcard() + self.beta = wildcard() + mu = is_op("mean")(self.data) + diff = is_op("subtract")(self.data, mu) + cdiff = diff | is_op("cast")(diff) + const_two = is_expr(relay.const(2)) | is_expr(relay.const(2.0)) + p1 = is_op("power")(cdiff, const_two) + mp1 = is_op("mean")(p1) | is_op("variance")(self.data, mu) + eps = is_expr(relay.const(1e-5)) + added_eps = is_op("add")(mp1, eps) + deno = is_op("sqrt")(added_eps) + div_out = is_op("divide")(diff, deno) + weighted = is_op("multiply")(div_out, self.gamma) + added_bias = is_op("add")(weighted, self.beta) + self.pattern = added_bias + + def callback(self, pre, post, node_map): + data = node_map[self.data][0] + gamma = node_map[self.gamma][0] + beta = node_map[self.beta][0] + return relay.op.nn.layer_norm(data=data, gamma=gamma, beta=beta) + + +def rewrite_layer_norm(mod): + """Rewrite the input graph to replace multiple operators with a TVM native layer normalization + operator so that we can offload them to dnnl layer normalization byoc part. 
+ """ + mod["main"] = rewrite(LayerNormRewrite(), mod["main"]) + return mod diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index a2417f012ea42..db8f25e2a6ea5 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -203,6 +203,8 @@ class DNNLJSONRuntime : public JSONRuntimeBase { Binary(nid, dnnl::algorithm::binary_add); } else if ("multiply" == op_name) { Binary(nid, dnnl::algorithm::binary_mul); + } else if ("nn.layer_norm" == op_name) { + LayerNorm(nid); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -449,6 +451,51 @@ class DNNLJSONRuntime : public JSONRuntimeBase { {DNNL_ARG_VARIANCE, var_tr}}); } + void LayerNorm(const size_t& nid) { + auto node = nodes_[nid]; + + auto src_tr = GetInput(nid, 0); + auto gamma_tr = GetInput(nid, 1); + auto beta_tr = GetInput(nid, 2); + auto dst_tr = GetOutput(nid, 0); + + auto axis = GetNodeAttr(node, "axis"); + auto epsilon = GetNodeAttr(node, "epsilon"); + auto center = GetNodeAttr(node, "center"); + auto scale = GetNodeAttr(node, "scale"); + + ICHECK(axis == -1 && center && scale) << "Unimplemented LayerNorm case"; + + // LN description. + auto lnorm_desc = dnnl::layer_normalization_forward::desc( + dnnl::prop_kind::forward_inference, src_tr.desc(), epsilon, + dnnl::normalization_flags::use_scale_shift); + + auto lnorm_prim_desc = dnnl::layer_normalization_forward::primitive_desc(lnorm_desc, engine_); + + // Concatenate scale and shift tensors + auto scale_shift_tr = TensorRequisite::AsIs(lnorm_prim_desc.weights_desc(), GenUniqueEid()); + auto sc_sh_dims = scale_shift_tr.dims(); + + ICHECK(sc_sh_dims.size() == 2); + ICHECK(sc_sh_dims[0] == 2); + sc_sh_dims[0] /= 2; + auto scale_tr = scale_shift_tr.Crop(sc_sh_dims, {0, 0}).Squeeze(); + auto shift_tr = scale_shift_tr.Crop(sc_sh_dims, {1, 0}).Squeeze(); + + auto register_copy = [this](const TensorRequisite& src, const TensorRequisite& dst) { + dnnl::reorder::primitive_desc copy_pd(engine_, src.desc(), engine_, dst.desc()); + Submit(dnnl::reorder(copy_pd), {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}}); + }; + + register_copy(gamma_tr, scale_tr); + register_copy(beta_tr, shift_tr); + + Submit( + dnnl::layer_normalization_forward(lnorm_prim_desc), + {{DNNL_ARG_SRC, src_tr}, {DNNL_ARG_DST, dst_tr}, {DNNL_ARG_SCALE_SHIFT, scale_shift_tr}}); + } + void Pooling(const size_t& nid, dnnl::algorithm algo) { auto node = nodes_[nid]; diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index babfad4a0c8c7..3e4e831aa594e 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -111,6 +111,8 @@ def partition_for_dnnl(mod, params=None, alter_layout=True): with tvm.transform.PassContext(opt_level=3): mod = alter_layout_seq(mod) + mod = dnnl.rewrite_layer_norm(mod) + byoc_seq = tvm.transform.Sequential( [ transform.MergeComposite(dnnl.pattern_table()), @@ -454,6 +456,16 @@ def get_conv2d_bias_bn_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype return relay.nn.relu(conv2d_bias_bn), dic, param_lst +def get_layer_norm(x_shape=(1, 49, 64), dtype="float32"): + dic = {"input": x_shape} + param_lst = [] + input = relay.var("input", shape=x_shape) + beta = relay.const(np.zeros(x_shape[2]).astype(dtype)) + gamma = relay.const(np.ones(x_shape[2]).astype(dtype)) + out = relay.nn.layer_norm(input, gamma=gamma, beta=beta) + return out, dic, param_lst + + def get_conv2d_bias_sum_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), 
dtype="float32"): conv2d_bias, dic, param_lst = get_conv2d_bias(x_shape, k_shape, dtype=dtype) sum_data = relay.const(np.random.randint(x_shape).astype(dtype)) @@ -1032,5 +1044,14 @@ def get_graph(): run_and_verify_func(get_graph(), subgraph_num=1, run_module=run_module, test_bf16=False) +def test_layer_norm(run_module, dtype="float32"): + x_shape = (1, 49, 64) + + ln, dic, param_lst = get_layer_norm(x_shape, dtype=dtype) + ln = tvm.IRModule.from_expr(ln) + config = ln, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + if __name__ == "__main__": tvm.testing.main() From 99c113a237cfd3f21d78fbb405160ed8b9b5af0b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 8 Jun 2022 12:39:09 -0700 Subject: [PATCH 072/181] [COMMUNITY] @tkonolige -> Committer (#11626) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index cfd99ae73f653..8f43ad455e08a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -47,6 +47,7 @@ We do encourage everyone to work anything they are interested in. - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - ethos-u, memory planner - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay +- [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 - tir, tvm-script - [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay From 97e681dc3477570b268bd84aae539219e5a0b29c Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 8 Jun 2022 13:23:58 -0700 Subject: [PATCH 073/181] [Hexagon] Add random string to workspace name (#11593) --- python/tvm/contrib/hexagon/build.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py index 43856253cb180..c659d66bec5db 100644 --- a/python/tvm/contrib/hexagon/build.py +++ b/python/tvm/contrib/hexagon/build.py @@ -25,6 +25,8 @@ import signal import socket import stat +import random +import string import subprocess from typing import Union @@ -58,7 +60,9 @@ def _get_hexagon_rpc_lib_dir() -> pathlib.Path: def _get_test_directory_name() -> str: """Generate a time-stamped name for use as a test directory name.""" - return datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + date_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + random_str = "".join(random.choice(string.ascii_lowercase) for _ in range(10)) + return f"{date_str}-{random_str}" class HexagonLauncherRPC(metaclass=abc.ABCMeta): From df4f4c0b4bccd775af25967fdf057392c1a2826e Mon Sep 17 00:00:00 2001 From: "Sevin F. 
Varoglu" Date: Wed, 8 Jun 2022 14:08:06 -0700 Subject: [PATCH 074/181] [ONNX] Add ReduceSum opset13 support (non-dynamic) (#11606) * [ONNX] Add ReduceSum opset13 support (non-dynamic) * Add check * Add support for constant axis * noop * Rework logic --- python/tvm/relay/frontend/onnx.py | 26 ++++++++++++++++++++++ tests/python/frontend/onnx/test_forward.py | 4 ---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index abfa5629d5534..29c0a778ef6ee 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2270,6 +2270,32 @@ def _impl_v12(cls, inputs, attr, params): return cls._impl_v1(inputs, attr, params) + @classmethod + def _impl_v13(cls, inputs, attr, params): + if not infer_shape(inputs[0]): # promote scalar to 1-D tensor + inputs[0] = _op.expand_dims(inputs[0], axis=0) + + noop_with_empty_axes = attr.get("noop_with_empty_axes", 0) + num_axis = int(infer_type(inputs[1]).checked_type.shape[0]) if inputs[1] is not None else 0 + + if noop_with_empty_axes and num_axis == 0: + return inputs[0] + + if len(inputs) == 2: + if isinstance(inputs[1], _expr.Constant): + # Get axis and unpack scalar + constant_axis = int(inputs[1].data.numpy()[0]) + return cls.run_calculation([inputs[0]], constant_axis, attr.get("keepdims", True)) + + if num_axis > 0: + raise ValueError("Dynamic Reduce is not supported yet!") + + axis_len = len(infer_shape(inputs[0])) + axis = list(range(axis_len)) + return cls.run_calculation([inputs[0]], axis, attr.get("keepdims", True)) + + return cls._impl_v1(inputs, attr, params) + class ReduceMax(Reduce): """Operator converter for ReduceMax.""" diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index ebaad9b4cb136..967597f7d12b8 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5172,12 +5172,8 @@ def verify_eyelike(indata): "test_qlinearmatmul_3D", "test_range_float_type_positive_delta_expanded", "test_range_int32_type_negative_delta_expanded", - "test_reduce_sum_default_axes_keepdims_example", - "test_reduce_sum_default_axes_keepdims_random", "test_reduce_sum_do_not_keepdims_example", "test_reduce_sum_do_not_keepdims_random", - "test_reduce_sum_empty_axes_input_noop_example", - "test_reduce_sum_empty_axes_input_noop_random", "test_reduce_sum_keepdims_example", "test_reduce_sum_keepdims_random", "test_reduce_sum_negative_axes_keepdims_example", From 2f9d9b4e5c7dcb3c9879fb2496f1f50e85b9c55a Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 9 Jun 2022 07:31:55 +0300 Subject: [PATCH 075/181] [OpenCL] Implement conv2d_winograd algorithm for Adreno (#11543) * Implement conv2d_winograd algorithm for Adreno * Implement gtest for OpenCL texture pool * Implement conv2d_nhwc_winograd for Adreno * Minor refactoring * Fix lint * Apply comments * Apply comments * Fix lint --- CMakeLists.txt | 16 + cmake/modules/LibInfo.cmake | 1 + cmake/modules/OpenCL.cmake | 6 + python/tvm/relay/op/strategy/adreno.py | 99 +++- python/tvm/topi/adreno/__init__.py | 2 + python/tvm/topi/adreno/conv2d_alter_op.py | 218 +++++++- .../tvm/topi/adreno/conv2d_nchw_winograd.py | 128 +++++ .../tvm/topi/adreno/conv2d_nhwc_winograd.py | 128 +++++ .../tvm/topi/adreno/conv2d_winograd_common.py | 512 ++++++++++++++++++ python/tvm/topi/adreno/utils.py | 28 + src/runtime/opencl/texture_pool.cc | 191 ++++--- src/runtime/texture.h | 22 +- src/support/libinfo.cc | 5 + 
.../opencl/opencl_texture_pool_test.cc | 151 ++++++ tests/cpp-runtime/opencl/run_gtests.cc | 60 ++ tests/python/contrib/test_opencl/conftest.py | 29 + .../contrib/test_opencl/test_run_gtests.py | 55 ++ .../python/relay/test_conv2d_nchw_texture.py | 43 ++ .../python/relay/test_conv2d_nhwc_texture.py | 43 ++ tests/python/relay/utils/adreno_utils.py | 1 + 20 files changed, 1638 insertions(+), 100 deletions(-) create mode 100644 python/tvm/topi/adreno/conv2d_nchw_winograd.py create mode 100644 python/tvm/topi/adreno/conv2d_nhwc_winograd.py create mode 100644 python/tvm/topi/adreno/conv2d_winograd_common.py create mode 100644 tests/cpp-runtime/opencl/opencl_texture_pool_test.cc create mode 100644 tests/cpp-runtime/opencl/run_gtests.cc create mode 100644 tests/python/contrib/test_opencl/conftest.py create mode 100644 tests/python/contrib/test_opencl/test_run_gtests.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 5352eddd25987..b4d6e18aad630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ endif() # Alernatively, use cmake -DOPTION=VALUE through command-line. tvm_option(USE_CUDA "Build with CUDA" OFF) tvm_option(USE_OPENCL "Build with OpenCL" OFF) +tvm_option(USE_OPENCL_GTEST "Path to OpenCL specific gtest version for runtime cpp tests." /path/to/opencl/gtest) tvm_option(USE_VULKAN "Build with Vulkan" OFF) @@ -609,6 +610,18 @@ if(BUILD_FOR_HEXAGON AND DEFINED USE_HEXAGON_GTEST AND EXISTS ${USE_HEXAGON_GTES include_directories("${USE_HEXAGON_GTEST}/include") endif() +if(USE_OPENCL AND DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST}) + include(FetchContent) + FetchContent_Declare(googletest SOURCE_DIR "${USE_OPENCL_GTEST}") + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + target_link_libraries(tvm_runtime PUBLIC gtest) + target_link_libraries(tvm PUBLIC gtest) + include_directories("${USE_OPENCL_GTEST}/include") + include_directories("${USE_OPENCL_GTEST}/googletest/include") + message(STATUS "Found OpenCL gtest at ${USE_OPENCL_GTEST}") +endif() + # Set flags for clang include(cmake/modules/ClangFlags.cmake) set(CRC16_INCLUDE_PATH "3rdparty/libcrc/include") @@ -668,6 +681,9 @@ install(TARGETS tvm_runtime EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_S if(BUILD_FOR_HEXAGON AND DEFINED USE_HEXAGON_GTEST AND EXISTS ${USE_HEXAGON_GTEST}) install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) endif() +if(USE_OPENCL AND DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST}) + install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) +endif() if (INSTALL_DEV) install( diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 76ddbede8ac06..3e6b3c787f656 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -89,6 +89,7 @@ function(add_lib_info src_file) TVM_INFO_USE_MSVC_MT="${USE_MSVC_MT}" TVM_INFO_USE_NNPACK="${USE_NNPACK}" TVM_INFO_USE_OPENCL="${USE_OPENCL}" + TVM_INFO_USE_OPENCL_GTEST="${USE_OPENCL_GTEST}" TVM_INFO_USE_OPENMP="${USE_OPENMP}" TVM_INFO_USE_PAPI="${USE_PAPI}" TVM_INFO_USE_PROFILER="${USE_PROFILER}" diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake index 648e83f575d18..430af7e8722c8 100644 --- a/cmake/modules/OpenCL.cmake +++ b/cmake/modules/OpenCL.cmake @@ -55,6 +55,12 @@ if(USE_OPENCL) message(STATUS "Build with OpenCL support") tvm_file_glob(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc) list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES}) + + if(DEFINED USE_OPENCL_GTEST AND EXISTS 
${USE_OPENCL_GTEST}) + file_glob_append(RUNTIME_OPENCL_SRCS + "${CMAKE_SOURCE_DIR}/tests/cpp-runtime/opencl/*.cc" + ) + endif() list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS}) else() list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc) diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py index a783440bb38cc..01b3935a6f1bc 100644 --- a/python/tvm/relay/op/strategy/adreno.py +++ b/python/tvm/relay/op/strategy/adreno.py @@ -28,6 +28,7 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): strategy = _op.OpStrategy() data, kernel = inputs dilation_h, dilation_w = attrs.get_int_tuple("dilation") + stride_h, stride_w = attrs.get_int_tuple("strides") groups = attrs.groups data_layout = attrs.data_layout kernel_layout = attrs.kernel_layout @@ -38,6 +39,28 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( data_layout == "NCHW4c" and kernel_layout == "OIHW4o" ): + if len(kernel.shape) == 4: + _, _, kh, kw = get_const_tuple(kernel.shape) + else: + _, _, kh, kw, _ = get_const_tuple(kernel.shape) + if ( + (2 < kh < 8 and 2 < kw < 8 and kh == kw) + and (stride_h == 1 and stride_w == 1) + and (dilation_h == 1 and dilation_w == 1) + ): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.image2d", + plevel=25, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_acc32), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_acc32), + name="conv2d_nchw_winograd_acc32.image2d", + plevel=30, + ) if out_type.dtype == "float16": strategy.add_implementation( wrap_compute_conv2d(topi.adreno.conv2d_nchwc), @@ -48,12 +71,34 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.adreno.conv2d_nchwc_acc32), wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc_acc32), - name="conv2d_nchwc_tpack.image2d", + name="conv2d_nchwc_acc32.image2d", plevel=20, ) elif (data_layout == "NHWC" and kernel_layout == "HWIO") or ( data_layout == "NHWC4c" and kernel_layout == "HWIO4o" ): + if len(kernel.shape) == 4: + kh, kw, _, _ = get_const_tuple(kernel.shape) + else: + kh, kw, _, _, _ = get_const_tuple(kernel.shape) + if ( + (2 < kh < 8 and 2 < kw < 8 and kh == kw) + and (stride_h == 1 and stride_w == 1) + and (dilation_h == 1 and dilation_w == 1) + ): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd), + name="conv2d_nhwc_winograd.image2d", + plevel=25, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_acc32), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_acc32), + name="conv2d_nhwc_winograd_acc32.image2d", + plevel=30, + ) if out_type.dtype == "float16": strategy.add_implementation( wrap_compute_conv2d(topi.adreno.conv2d_nhwc), @@ -153,6 +198,58 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): return strategy +@conv2d_winograd_without_weight_transfrom_strategy.register("adreno") +def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom adreno strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = 
attrs.data_layout + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout in ("NCHW", "NCHW4c"): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform), + wrap_topi_schedule( + topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform + ), + name="conv2d_nchw_winograd_without_weight_transform.image2d", + plevel=35, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform_acc32), + wrap_topi_schedule( + topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform_acc32 + ), + name="conv2d_nchw_winograd_without_weight_transform_acc32.image2d", + plevel=40, + ) + elif layout in ("NHWC", "NHWC4c"): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform), + wrap_topi_schedule( + topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform + ), + name="conv2d_nhwc_winograd_without_weight_transform.image2d", + plevel=35, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform_acc32), + wrap_topi_schedule( + topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform_acc32 + ), + name="conv2d_nhwc_winograd_without_weight_transform_acc32.image2d", + plevel=40, + ) + else: + raise RuntimeError( + "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) + ) + return strategy + + @schedule_pool.register("adreno") def schedule_pool_adreno(attrs, outs, target): """schedule pooling ops for adreno""" diff --git a/python/tvm/topi/adreno/__init__.py b/python/tvm/topi/adreno/__init__.py index 6c9b7463c1d4e..57a9013b1a2ab 100644 --- a/python/tvm/topi/adreno/__init__.py +++ b/python/tvm/topi/adreno/__init__.py @@ -23,3 +23,5 @@ from .depthwise_conv2d_nhwc import * from .pooling import * from .conv2d_alter_op import * +from .conv2d_nchw_winograd import * +from .conv2d_nhwc_winograd import * diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py index e8944093c0f54..16573991e09c5 100644 --- a/python/tvm/topi/adreno/conv2d_alter_op.py +++ b/python/tvm/topi/adreno/conv2d_alter_op.py @@ -25,6 +25,7 @@ from tvm import relay from tvm import autotvm from ..utils import get_const_tuple +from .utils import infer_tile_size from ..nn import conv2d_alter_layout logger = logging.getLogger("topi") @@ -58,7 +59,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): kernel_layout = attrs["kernel_layout"] data_tensor, kernel_tensor = tinfos data_dtype = data_tensor.dtype - kernel_dtype = kernel_tensor.dtype out_dtype = out_type.dtype if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): @@ -70,12 +70,228 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ) workload = autotvm.task.get_workload(outs) if workload is None: + if impl.name.find("winograd") != -1: + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert (data_layout == "NCHW" and kernel_layout == "OIHW") or ( + data_layout == "NHWC" and kernel_layout == "HWIO" + ) + if data_layout == "NCHW": + N, CI, H, W = get_const_tuple(data_tensor.shape) + CO, _, KH, KW = get_const_tuple(kernel_tensor.shape) + weight = inputs[1] + else: + N, H, W, CI = get_const_tuple(data_tensor.shape) + KH, 
KW, _, CO = get_const_tuple(kernel_tensor.shape) + weight = relay.layout_transform(inputs[1], "HWIO", "OIHW") + + # Pre-compute weight transformation in winograd + tile_size = infer_tile_size(data_tensor, data_layout) + + # alpha, alpha, CO, CI + weight = relay.nn.contrib_conv2d_winograd_weight_transform( + weight, tile_size=tile_size + ) + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) return None cfg = dispatch_ctx.query(target, workload) topi_tmpl = workload[0] + if "conv2d_nchw_winograd" in topi_tmpl: + suffix = "_acc32" if "acc32" in topi_tmpl else "" + wkl_name = "conv2d_nchw_winograd_without_weight_transform" + suffix + ".image2d" + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + tile_size = infer_tile_size(data_tensor, data_layout) + if len(data_tensor.shape) == 5: + assert data_layout == "NCHW4c" and kernel_layout == "OIHW4o" + N, CI, H, W, CB = get_const_tuple(data_tensor.shape) + CO, _, KH, KW, COB = get_const_tuple(kernel_tensor.shape) + weight = relay.layout_transform(inputs[1], "OIHW4o", "OIHW") + weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size) + weight = relay.layout_transform(weight, "HWOI", "HWIO4o") + + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO * COB + + new_data = data_tensor + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI * CB, CO, COB), + dtype=kernel_tensor.dtype, + ) + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data_tensor.shape) + CO, _, KH, KW = get_const_tuple(kernel_tensor.shape) + + # pre-compute weight transformation in winograd + weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size) + weight = relay.transpose(weight, axes=[2, 3, 0, 1]) # HWOI -> OIHW + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + + # Store the same config for the altered operator (workload) + new_data = data_tensor + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel_tensor.dtype + ) + in_channel_block = CI % 4 + if in_channel_block == 0: + in_channel_block = 4 + num_filter_block = CO % 4 + if num_filter_block == 0: + num_filter_block = 4 + + if in_channel_block != 4 or num_filter_block != 4: + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + + new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + # (oc, ic, h, w) -> (h, w, ic, oc // 4, oc % 4) + new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block + new_attrs["out_layout"] = "NCHW%dc" % num_filter_block + # Store altered operator's config + new_data = te.placeholder( + (N, CI // in_channel_block, H, W, in_channel_block), dtype=data_dtype + ) + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI, CO // num_filter_block, num_filter_block), + dtype=kernel_tensor.dtype, + ) + 
new_workload = autotvm.task.args_to_workload( + [ + new_data, + new_weight, + strides, + padding, + dilation, + out_dtype, + ], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + + if "conv2d_nhwc_winograd" in topi_tmpl: + suffix = "_acc32" if "acc32" in topi_tmpl else "" + wkl_name = "conv2d_nhwc_winograd_without_weight_transform" + suffix + ".image2d" + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + tile_size = infer_tile_size(data_tensor, data_layout) + if len(data_tensor.shape) == 5: + assert data_layout == "NHWC4c" and kernel_layout == "HWIO4o" + N, CI, H, W, CB = get_const_tuple(data_tensor.shape) + KH, KW, _, CO, COB = get_const_tuple(kernel_tensor.shape) + weight = relay.layout_transform(inputs[1], "HWIO4o", "OIHW") + weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size) + weight = relay.layout_transform(weight, "HWOI", "HWIO4o") + + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO * COB + + new_data = data_tensor + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI * CB, CO, COB), + dtype=kernel_tensor.dtype, + ) + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data_tensor.shape) + KH, KW, _, CO = get_const_tuple(kernel_tensor.shape) + + # pre-compute weight transformation in winograd + weight = relay.layout_transform(inputs[1], "HWIO", "OIHW") + weight = relay.nn.contrib_conv2d_winograd_weight_transform(weight, tile_size=tile_size) + weight = relay.transpose(weight, axes=[0, 1, 3, 2]) # HWOI -> HWIO + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + + # Store the same config for the altered operator (workload) + new_data = data_tensor + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel_tensor.dtype + ) + in_channel_block = CI % 4 + if in_channel_block == 0: + in_channel_block = 4 + num_filter_block = CO % 4 + if num_filter_block == 0: + num_filter_block = 4 + + if in_channel_block != 4 or num_filter_block != 4: + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + + new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + # (oc, ic, h, w) -> (h, w, ic, oc // 4, oc % 4) + new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block + new_attrs["out_layout"] = "NHWC%dc" % num_filter_block + # Store altered operator's config + new_data = te.placeholder( + (N, H, W, CI // in_channel_block, in_channel_block), dtype=data_dtype + ) + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, CI, CO // num_filter_block, num_filter_block), + dtype=kernel_tensor.dtype, + ) + new_workload = autotvm.task.args_to_workload( + [ + new_data, + new_weight, + strides, + padding, + dilation, + out_dtype, + ], + wkl_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return 
relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + if "conv2d_nchwc" in topi_tmpl: # covers both conv2d_nchwc and depthwise_conv2d_nchwc if data_layout == "NCHW" and kernel_layout == "OIHW": batch, in_channels, in_height, in_width = data_tensor.shape diff --git a/python/tvm/topi/adreno/conv2d_nchw_winograd.py b/python/tvm/topi/adreno/conv2d_nchw_winograd.py new file mode 100644 index 0000000000000..16f7cb8b19d95 --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_nchw_winograd.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Winograd NCHW template for Adreno backend""" + +import logging +from tvm import autotvm +from .conv2d_winograd_common import conv2d_winograd_comp, schedule_conv2d_winograd_impl + + +logger = logging.getLogger("conv2d_nchw_winograd") + + +@autotvm.register_topi_compute("conv2d_nchw_winograd.image2d") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + args = {"shared": False, "accumulator": "float16"} + return conv2d_nchw_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + ) + + +@autotvm.register_topi_compute("conv2d_nchw_winograd_acc32.image2d") +def conv2d_nchw_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): + args = {"shared": False, "accumulator": "float32"} + return conv2d_nchw_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + ) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.image2d") +def schedule_conv2d_nchw_winograd(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd_acc32.image2d") +def schedule_conv2d_nchw_winograd_acc32(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + + +@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.image2d") +def conv2d_nchw_winograd_without_weight_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + args = {"shared": False, "accumulator": "float16"} + return conv2d_nchw_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + ) + + +@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") +def conv2d_nchw_winograd_without_weight_transform_acc32( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + args = {"shared": False, "accumulator": "float32"} + return conv2d_nchw_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + ) + + 
+@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.image2d") +def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") +def schedule_conv2d_nchw_winograd_without_weight_transform_acc32(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + + +def conv2d_nchw_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed +): + """Compute declaration for winograd + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data: tvm.te.Tensor + 4-D or 5-D Data tensor with shape NCHW or NCHW4c + + kernel: tvm.te.Tensor + 4-D or 5-D tensor with shape OIHW or OIHW4o + + strides: int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + out_dtype: str + The output type. This is used for mixed precision. + + args: dict + Dictionary with additional arguments, e.g. accumulator type + + pre_computed: bool + Flag if weights were pre computed if true or the weights should be + computed in runtime + + Returns + ------- + output: tvm.te.Tensor + 4-D or 5-D with shape NCHW or NCHW4c + """ + return conv2d_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NCHW" + ) diff --git a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py new file mode 100644 index 0000000000000..bfe385f210a49 --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument +"""Winograd NHWC template for Adreno backend""" + +import logging +from tvm import autotvm +from .conv2d_winograd_common import conv2d_winograd_comp, schedule_conv2d_winograd_impl + + +logger = logging.getLogger("conv2d_nhwc_winograd") + + +@autotvm.register_topi_compute("conv2d_nhwc_winograd.image2d") +def conv2d_nhwc_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + args = {"shared": False, "accumulator": "float16"} + return conv2d_nhwc_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + ) + + +@autotvm.register_topi_compute("conv2d_nhwc_winograd_acc32.image2d") +def conv2d_nhwc_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): + args = {"shared": False, "accumulator": "float32"} + return conv2d_nhwc_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + ) + + +@autotvm.register_topi_schedule("conv2d_nhwc_winograd.image2d") +def schedule_conv2d_nhwc_winograd(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("conv2d_nhwc_winograd_acc32.image2d") +def schedule_conv2d_nhwc_winograd_acc32(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + + +@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform.image2d") +def conv2d_nhwc_winograd_without_weight_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + args = {"shared": False, "accumulator": "float16"} + return conv2d_nhwc_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + ) + + +@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") +def conv2d_nhwc_winograd_without_weight_transform_acc32( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + args = {"shared": False, "accumulator": "float32"} + return conv2d_nhwc_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + ) + + +@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform.image2d") +def schedule_conv2d_nhwc_winograd_without_weight_transform(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) + + +@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") +def schedule_conv2d_nhwc_winograd_without_weight_transform_acc32(cfg, outs): + return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + + +def conv2d_nhwc_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed +): + """Compute declaration for winograd + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data: tvm.te.Tensor + 4-D or 5-D Data tensor with shape NCHW or NCHW4c + + kernel: tvm.te.Tensor + 4-D or 5-D tensor with shape OIHW or OIHW4o + + strides: int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + out_dtype: str + The output type. This is used for mixed precision. 
+ + args: dict + Dictionary with additional arguments, e.g. accumulator type + + pre_computed: bool + Flag if weights were pre computed if true or the weights should be + computed in runtime + + Returns + ------- + output: tvm.te.Tensor + 4-D or 5-D with shape NCHW or NCHW4c + """ + return conv2d_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NHWC" + ) diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py new file mode 100644 index 0000000000000..494b691a7f076 --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_winograd_common.py @@ -0,0 +1,512 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Common Winograd implementation for Adreno backend""" + +import tvm +from tvm import te +from tvm import autotvm + +from tvm.topi import nn +from tvm.topi.utils import get_const_int, get_const_tuple, traverse_inline +from ..nn.winograd_util import winograd_transform_matrices +from .utils import ( + split_to_chunks, + pack_input, + pack_filter, + bind_data_copy, + get_texture_storage, + infer_tile_size, +) + + +def conv2d_winograd_comp( + cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, layout +): + """Compute declaration for winograd + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data: tvm.te.Tensor + 4-D or 5-D Data tensor with shape NCHW or NCHW4c + + kernel: tvm.te.Tensor + 4-D or 5-D tensor with shape OIHW or OIHW4o + + strides: int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + out_dtype: str + The output type. This is used for mixed precision. + + args: dict + Dictionary with additional arguments, e.g. 
accumulator type + + pre_computed: bool + Flag if weights were pre computed if true or the weights should be + computed in runtime + + layout: str + NHWC or NCHW values are accepted + + Returns + ------- + output: tvm.te.Tensor + 4-D or 5-D with shape NCHW or NCHW4c + """ + assert layout in ("NCHW", "NHWC") + tile_size = infer_tile_size(data, layout) + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides + + convert_from4d = False + if len(data.shape) == 4: + if layout == "NCHW": + N, DCI, H, W = get_const_tuple(data.shape) + else: + N, H, W, DCI = get_const_tuple(data.shape) + if not pre_computed: + if layout == "NCHW": + out_channels, CI, KH, KW = get_const_tuple(kernel.shape) + else: + KH, KW, CI, out_channels = get_const_tuple(kernel.shape) + else: + alpha, _, CI, out_channels = get_const_tuple(kernel.shape) + KH = KW = alpha + 1 - tile_size + + in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(CI, 4) + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channels, 4) + if autotvm.GLOBAL_SCOPE.in_tuning is True: + if layout == "NCHW": + dshape = (N, in_channel_chunks, H, W, in_channel_block) + else: + dshape = (N, H, W, in_channel_chunks, in_channel_block) + if not pre_computed: # kernel tensor is raw tensor, do strict check + if layout == "NCHW": + kshape = (out_channel_chunks, CI, KH, KW, out_channel_block) + else: + kshape = (KH, KW, CI, out_channel_chunks, out_channel_block) + else: + kshape = (alpha, alpha, CI, out_channel_chunks, out_channel_block) + data = tvm.te.placeholder(dshape, data.dtype, name="data_placeholder") + kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel_placeholder") + else: + convert_from4d = True + data = pack_input( + data, layout, N, in_channel_chunks, in_channel_block, in_channel_tail, H, W + ) + kernel_layout = "OIHW" if layout == "NCHW" else "HWIO" + if not pre_computed: # kernel tensor is raw tensor, do strict check + kernel = pack_filter( + kernel, + kernel_layout, + out_channel_chunks, + out_channel_block, + out_channel_tail, + CI, + in_channel_chunks, + in_channel_block, + in_channel_tail, + KH, + KW, + ) + else: + kernel = pack_filter( + kernel, + "HWIO", + out_channel_chunks, + out_channel_block, + out_channel_tail, + CI, + in_channel_chunks, + in_channel_block, + in_channel_tail, + alpha, + alpha, + ) + if layout == "NCHW": + N, DCI, H, W, CB = get_const_tuple(data.shape) + else: + N, H, W, DCI, CB = get_const_tuple(data.shape) + if not pre_computed: # kernel tensor is raw tensor, do strict check + if layout == "NCHW": + CO, CI, KH, KW, COB = get_const_tuple(kernel.shape) + else: + KH, KW, CI, CO, COB = get_const_tuple(kernel.shape) + alpha = KW + tile_size - 1 + assert HSTR == 1 and WSTR == 1 and KH == KW + else: + alpha, _, CI, CO, COB = get_const_tuple(kernel.shape) + KH = KW = alpha + 1 - tile_size + assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1 + + if isinstance(N, tvm.tir.Any): + N = tvm.te.size_var("n") + + if not isinstance(H, int) or not isinstance(W, int): + raise RuntimeError( + "adreno winograd conv2d doesn't support dynamic input\ + height or width." 
+ ) + + pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) + if layout == "NCHW": + data_pad = nn.pad(data, (0, 0, pt, pl, 0), (0, 0, pb, pr, 0), name="data_pad") + else: + data_pad = nn.pad(data, (0, pt, pl, 0, 0), (0, pb, pr, 0, 0), name="data_pad") + + r = KW + m = tile_size + A, B, G = winograd_transform_matrices(m, r, out_dtype) + + H = (H + pt + pb - KH) // HSTR + 1 + W = (W + pl + pr - KW) // WSTR + 1 + nH, nW = (H + m - 1) // m, (W + m - 1) // m + + P = N * nH * nW if isinstance(N, int) else nH * nW + + # transform kernel + if not pre_computed: + r_kh = te.reduce_axis((0, KH), name="r_kh") + r_kw = te.reduce_axis((0, KW), name="r_kw") + if layout == "NCHW": + kernel_pack = te.compute( + (alpha, alpha, CI, CO, COB), + lambda eps, nu, ci, co, cob: te.sum( + kernel[co][ci][r_kh][r_kw][cob] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw] + ), + name="kernel_pack", + ) + else: + kernel_pack = te.compute( + (alpha, alpha, CI, CO, COB), + lambda eps, nu, ci, co, cob: te.sum( + kernel[r_kh][r_kw][ci][co][cob] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw] + ), + name="kernel_pack", + ) + else: + kernel_pack = kernel + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + if layout == "NCHW": + N, CI, H, W, CB = get_const_tuple(data.shape) + else: + N, H, W, CI, CB = get_const_tuple(data.shape) + + # pack input tile + if layout == "NCHW": + input_tile = te.compute( + (alpha, alpha, CI, P, CB), + lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][c][ + idxmod(idxdiv(p, nW), nH) * m + eps + ][idxmod(p, nW) * m + nu][cb], + name="d", + ) + else: + input_tile = te.compute( + (alpha, alpha, CI, P, CB), + lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][ + idxmod(idxdiv(p, nW), nH) * m + eps + ][idxmod(p, nW) * m + nu][c][cb], + name="d", + ) + + # transform data + r_a = te.reduce_axis((0, alpha), "r_a") + r_b = te.reduce_axis((0, alpha), "r_a") + data_pack = te.compute( + (P, CI, alpha, alpha, CB), + lambda p, ci, eps, nu, cb: te.sum( + input_tile[r_a][r_b][ci][p][cb] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b] + ), + name="data_pack", + ) + + # repack transformed data + data_pack_trans = te.compute( + (alpha, alpha, CI, P, CB), + lambda eps, nu, c, p, cb: data_pack[p][c][eps][nu][cb], + name="data_pack_trans", + ) + + # do batch gemm + ci = te.reduce_axis((0, CI), name="ci") + cb = te.reduce_axis((0, CB), name="cb") + bgemm = te.compute( + (alpha, alpha, CO, P, COB), + lambda eps, nu, co, p, cob: te.sum( + ( + kernel_pack[eps][nu][ci * CB + cb][co][cob] * data_pack_trans[eps][nu][ci][p][cb] + ).astype(args["accumulator"]), + axis=[ci, cb], + ), + name="bgemm", + ) + + # inverse transform + r_a = te.reduce_axis((0, alpha), "r_a") + r_b = te.reduce_axis((0, alpha), "r_a") + inverse = te.compute( + (CO, P, m, m, COB), + lambda co, p, vh, vw, cob: te.sum( + bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(args["accumulator"]), + axis=[r_a, r_b], + ), + name="inverse", + ) + + # output + if layout == "NCHW": + if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False: + output = te.compute( + (N, out_channels, H, W), + lambda n, c, h, w: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][ + idxmod(h, m) + ][idxmod(w, m)][c % CB].astype(out_dtype), + name="output", + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + output = te.compute( + (N, CO, H, W, COB), + lambda n, co, h, w, cob: inverse[co][ + n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) + ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), + name="output", + 
tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False: + output = te.compute( + (N, H, W, out_channels), + lambda n, h, w, c: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][ + idxmod(h, m) + ][idxmod(w, m)][c % CB].astype(out_dtype), + name="output", + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + output = te.compute( + (N, H, W, CO, COB), + lambda n, h, w, co, cob: inverse[co][ + n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) + ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), + name="output", + tag="cast_from_acc" + args["accumulator"][-2:], + ) + + if isinstance(N, int): + cfg.add_flop(2 * N * CO * COB * H * W * CI * CB * KH * KW) + + return output + + +def schedule_conv2d_winograd_impl(cfg, outs, tag, pre_computed=False): + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == tag: + schedule_conv2d_winograd(cfg, s, op.output(0), pre_computed=pre_computed) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def schedule_conv2d_winograd(cfg, s, output, pre_computed): + """Schedule winograd template""" + inverse = s[output].op.input_tensors[0] + bgemm, A = s[inverse].op.input_tensors + kernel_pack, data_pack_trans = s[bgemm].op.input_tensors + data_pack = s[data_pack_trans].op.input_tensors[0] + input_tile, B = s[data_pack].op.input_tensors + pad_data = s[input_tile].op.input_tensors[0] + + # data transform + s[B].compute_inline() + s[A].compute_inline() + + # probably will improve real topology execution + if autotvm.GLOBAL_SCOPE.in_tuning: + # Padding to texture + AA = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [input_tile]) + bind_data_copy(s[AA]) + + s[input_tile].compute_inline() + + OL = s.cache_write(data_pack, "local") + c, p, eps, nu, cb = s[data_pack].op.axis + fused = s[data_pack].fuse(c, p, eps, nu) + bx, tx = s[data_pack].split(fused, 128) + s[data_pack].vectorize(cb) + s[data_pack].bind(bx, te.thread_axis("blockIdx.x")) + s[data_pack].bind(tx, te.thread_axis("threadIdx.x")) + + _, _, eps, nu, cb = s[OL].op.axis + r_a, r_b = s[OL].op.reduce_axis + s[OL].unroll(eps) + s[OL].unroll(nu) + s[OL].unroll(r_a) + s[OL].unroll(r_b) + s[OL].vectorize(cb) + s[OL].compute_at(s[data_pack], tx) + s[data_pack].set_scope(get_texture_storage(data_pack.shape)) + + s[data_pack_trans].compute_inline() + + # transform kernel + if not pre_computed: + kernel, G = s[kernel_pack].op.input_tensors + eps, nu, ci, co, cob = s[kernel_pack].op.axis + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during pre-compute optimization pass + s[G].pragma(s[G].op.axis[0], "debug_skip_region") + s[kernel_pack].pragma(eps, "debug_skip_region") + else: + s[G].compute_inline() + r_a, r_b = s[kernel_pack].op.reduce_axis + for axis in [eps, nu, r_a, r_b]: + s[kernel_pack].unroll(axis) + + fused = s[kernel_pack].fuse(ci, co) + bb, tt = s[kernel_pack].split(fused, 128) + s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b, cob) + s[kernel_pack].vectorize(cob) + s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) + else: + kernel = kernel_pack + + if isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag: + # manage scheduling of datacopy + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + bind_data_copy(s[kernel]) + 
elif isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + s[pad_data].compute_inline() + + ##### space definition begin ##### + cfg.define_knob("auto_unroll_max_step", [0, 4, 16]) + b1, b2, y, x, cb = s[bgemm].op.axis + rcc = s[bgemm].op.reduce_axis[0] + alpha = get_const_int(b1.dom.extent) + + cfg.define_split( + "tile_y", y, num_outputs=3, filter=lambda entry: entry.size[2] <= 64 and entry.size[1] <= 8 + ) + cfg.define_split( + "tile_x", + x, + num_outputs=3, + filter=lambda entry: entry.size[2] <= 64 and entry.size[1] >= 4 and entry.size[1] <= 8, + ) + cfg.define_split("tile_rc", rcc, num_outputs=2) + # TODO: Uncomment the following lines when multi_filter will be introduced + # cfg.multi_filter( + # filter=lambda entity: entity["tile_y"].size[2] * entity["tile_x"].size[2] in range(32,1024) + # ) + ##### space definition end ##### + + # batch gemm + OL = s.cache_write(bgemm, "local") + if ( + autotvm.GLOBAL_SCOPE.in_tuning + or isinstance(kernel.op, tvm.te.ComputeOp) + and "filter_pack" in kernel.op.tag + ): + BB = s.cache_read(kernel_pack, get_texture_storage(kernel_pack.shape), [OL]) + bind_data_copy(s[BB]) + + by = s[bgemm].fuse(b1, b2, y) + + # tile and bind spatial axes + bgemm_scope, by = s[bgemm].split(by, nparts=1) + by, vy, ty = cfg["tile_y"].apply(s, bgemm, by) + bx, vx, tx = cfg["tile_x"].apply(s, bgemm, x) + s[bgemm].bind(by, te.thread_axis("blockIdx.y")) + s[bgemm].bind(bx, te.thread_axis("blockIdx.x")) + s[bgemm].bind(vy, te.thread_axis("vthread")) + s[bgemm].bind(vx, te.thread_axis("vthread")) + s[bgemm].bind(ty, te.thread_axis("threadIdx.y")) + s[bgemm].bind(tx, te.thread_axis("threadIdx.x")) + s[bgemm].reorder(bgemm_scope, by, bx, vy, vx, ty, tx, cb) + s[bgemm].vectorize(cb) + s[bgemm].set_scope(get_texture_storage(bgemm.shape)) + + # tile reduction axes + s[OL].compute_at(s[bgemm], tx) + b1, b2, y, x, cb = s[OL].op.axis + (rcc, rcb) = s[OL].op.reduce_axis + b = s[OL].fuse(b1, b2) + s[OL].reorder(b, y, x, rcc, rcb, cb) + # s[OL].unroll(rcb) + s[OL].pragma(rcb, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[OL].pragma(rcb, "unroll_explicit", True) + s[OL].vectorize(cb) + + # schedule inverse, output and fusion + if output.op in s.outputs: + OL = None + else: + OL = output + s[OL].set_scope("local") + output = s.outputs[0] + + m = alpha - 3 + 1 + if len(s[output].op.axis) == 4: + n, co, h, w = s[output].op.axis + else: + n, co, h, w, _ = s[output].op.axis + ho, wo, hi, wi = s[output].tile(h, w, m, m) + inverse_scope, n = s[output].split(n, nparts=1) + + fused = s[output].fuse(n, co, ho, wo) + bb, tt = s[output].split(fused, 128) + + s[output].bind(bb, te.thread_axis("blockIdx.x")) + s[output].bind(tt, te.thread_axis("threadIdx.x")) + + if OL is not None: + s[OL].compute_at(s[output], tt) + + co, p, vh, vw, cb = s[inverse].op.axis + r_a, r_b = s[inverse].op.reduce_axis + for axis in [vh, vw, r_a, r_b]: + s[inverse].unroll(axis) + s[inverse].vectorize(cb) + s[inverse].compute_at(s[output], tt) + + return s diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py index 727741c11fd3f..78a992e56a0f9 100644 --- a/python/tvm/topi/adreno/utils.py +++ b/python/tvm/topi/adreno/utils.py @@ -547,3 +547,31 @@ def get_texture_storage(shape): return "global.texture-nhwc" else: return "global.texture-weight" + + +def infer_tile_size(data, layout): + """Compute the tile size for Winograd algorithm + + Parameters + ---------- + data: tvm.te.Tensor + Data tensor + + layout: string + Layout of data tebsir + 
NCHW, NCHW4c, NHWC or NHWC4c are acceptable + + Returns + ------- + tile_size : int + Calculated tile size + """ + assert layout in ("NCHW", "NCHW4c", "NHWC", "NHWC4c"), "Incompatible layout" + if layout in ("NCHW", "NCHW4c"): + H = get_const_tuple(data.shape)[2] + else: + H = get_const_tuple(data.shape)[1] + + if H % 8 == 0: + return 4 + return 2 diff --git a/src/runtime/opencl/texture_pool.cc b/src/runtime/opencl/texture_pool.cc index e7f6655c41142..0b9477f2d4ea3 100644 --- a/src/runtime/opencl/texture_pool.cc +++ b/src/runtime/opencl/texture_pool.cc @@ -29,113 +29,112 @@ namespace tvm { namespace runtime { -class TexturePool::Pool { - public: - Pool() = default; - void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, DLDataType type_hint) { - Entry e; - e.data = nullptr; - if (free_list_.size() != 0) { - Entry new_mem; - int64_t min_added_size_x = std::numeric_limits::max(); - int64_t min_added_size_y = std::numeric_limits::max(); - int64_t min_wasted_size_x = std::numeric_limits::max(); - int64_t min_wasted_size_y = std::numeric_limits::max(); - std::vector::iterator best_mem; - for (auto it = free_list_.begin(); it != free_list_.end(); ++it) { - if (it->type.code != type_hint.code) { - continue; - } - new_mem.x = std::max(it->x, width); - new_mem.y = std::max(it->y, height); - int64_t added_size_x = new_mem.x - it->x; - int64_t added_size_y = new_mem.y - it->y; - int64_t wasted_size_x = new_mem.x - width; - int64_t wasted_size_y = new_mem.y - height; - // Minimize added size first and wasted size thereafter - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_mem = it; - } +void* Pool2D::Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, + DLDataType type_hint) { + Entry e; + Entry new_mem; + // Processed several experiments and found that when we are trying to fit + // small texture to too big texture then it may lead to the performance + // degradation. + // Coefficient at 5 looks like robust variant for reusing textures. 
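+  // For example, with max_ratio = 5 a free 2048x64 texture is not considered for a
+  // 64x64 request (2048 / 64 = 32 > 5), and a 16x16 texture is not grown to serve a
+  // 1024x1024 request; in those cases a fresh texture is allocated instead.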
+ const int64_t max_ratio = 5; + e.data = nullptr; + std::vector::iterator best_mem; + if (free_list_.size() != 0) { + int64_t min_added_size_x = std::numeric_limits::max(); + int64_t min_added_size_y = std::numeric_limits::max(); + int64_t min_wasted_size_x = std::numeric_limits::max(); + int64_t min_wasted_size_y = std::numeric_limits::max(); + for (auto it = free_list_.begin(); it != free_list_.end(); ++it) { + if (it->type.code != type_hint.code) { + continue; } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - e = *best_mem; - free_list_.erase(best_mem); - } else if (static_cast(min_added_size_x) <= width || - static_cast(min_added_size_y) <= height) { - // if added size is less or equal to - // what is needed by alloc, then grow entry - device->FreeDataSpace(dev, best_mem->data); - free_list_.erase(best_mem); - new_mem.type = type_hint; - std::vector shape{int64_t(new_mem.y), int64_t(new_mem.x), 4}; - new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), new_mem.type, - Optional("global.texture")); - e = new_mem; + // avoid reusing too small and too big textures + if (width / it->x > max_ratio || it->x / width > max_ratio || height / it->y > max_ratio || + it->y / height > max_ratio) { + continue; + } + int64_t new_width = std::max(it->x, width); + int64_t new_height = std::max(it->y, height); + int64_t added_size_x = new_width - it->x; + int64_t added_size_y = new_height - it->y; + int64_t wasted_size_x = new_width - width; + int64_t wasted_size_y = new_height - height; + // Minimize added size first and wasted size thereafter + if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || + (min_added_size_y > 0 && added_size_y < min_added_size_y) || + (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || + (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { + min_added_size_x = added_size_x; + min_added_size_y = added_size_y; + min_wasted_size_x = wasted_size_x; + min_wasted_size_y = wasted_size_y; + best_mem = it; + new_mem.x = new_width; + new_mem.y = new_height; } } - if (e.data == nullptr) { - // create new block - std::vector shape{int64_t(height), int64_t(width), 4}; - e.data = device->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - Optional("global.texture")); - e.x = width; - e.y = height; - e.type = type_hint; + if (min_added_size_x == 0 && min_added_size_y == 0) { + // use existing block + e = *best_mem; + free_list_.erase(best_mem); + } else if (static_cast(min_added_size_x) <= width || + static_cast(min_added_size_y) <= height) { + // if added size is less or equal to + // what is needed by alloc, then grow entry + device->FreeDataSpace(dev, best_mem->data); + free_list_.erase(best_mem); + new_mem.type = type_hint; + std::vector shape{int64_t(new_mem.y), int64_t(new_mem.x), 4}; + new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), new_mem.type, + Optional("global.texture")); + e = new_mem; } - - allocated_.push_back(e); - return e.data; } - void Free(void* data) { - Entry e; - if (allocated_.back().data == data) { - // quick path, last allocated. 
- e = allocated_.back(); - allocated_.pop_back(); - } else { - int index = static_cast(allocated_.size()) - 2; - for (; index >= 0 && allocated_[index].data != data; --index) { - } - ICHECK_GE(index, 0) << "Attempt to free texture that has not been allocated"; - e = allocated_[index]; - allocated_.erase(allocated_.begin() + index); - } - free_list_.push_back(e); + if (e.data == nullptr) { + // create new block + std::vector shape{int64_t(height), int64_t(width), 4}; + e.data = device->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + Optional("global.texture")); + e.x = width; + e.y = height; + e.type = type_hint; } - // Release all resources immediately - void Release(Device dev, DeviceAPI* device) { - for (auto& e : allocated_) { - device->FreeDataSpace(dev, e.data); - } - for (auto& e : free_list_) { - device->FreeDataSpace(dev, e.data); + allocated_.push_back(e); + return e.data; +} + +void Pool2D::Free(void* data) { + Entry e; + if (allocated_.back().data == data) { + // quick path, last allocated. + e = allocated_.back(); + allocated_.pop_back(); + } else { + int index = static_cast(allocated_.size()) - 2; + for (; index >= 0 && allocated_[index].data != data; --index) { } - allocated_.clear(); - free_list_.clear(); + ICHECK_GE(index, 0) << "Attempt to free texture that has not been allocated"; + e = allocated_[index]; + allocated_.erase(allocated_.begin() + index); } + free_list_.push_back(e); +} - private: - struct Entry { - void* data; - size_t x; - size_t y; - DLDataType type; - }; - std::vector free_list_; - std::vector allocated_; -}; +// Release all resources immediately +void Pool2D::Release(Device dev, DeviceAPI* device) { + for (auto& e : allocated_) { + device->FreeDataSpace(dev, e.data); + } + for (auto& e : free_list_) { + device->FreeDataSpace(dev, e.data); + } + allocated_.clear(); + free_list_.clear(); +} TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device) : device_type_(device_type), device_(device) {} @@ -157,7 +156,7 @@ void* TexturePool::AllocTexture(Device dev, size_t width, size_t height, DLDataT array_.resize(dev.device_id + 1, nullptr); } if (array_[dev.device_id] == nullptr) { - array_[dev.device_id] = new Pool(); + array_[dev.device_id] = new Pool2D(); } return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint); } diff --git a/src/runtime/texture.h b/src/runtime/texture.h index 5f43c8cee8f3f..dc38101f0cd4f 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -94,6 +94,25 @@ inline bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } +class TVM_DLL Pool2D { + public: + Pool2D() = default; + void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, DLDataType type_hint); + void Free(void* data); + // Release all resources immediately + void Release(Device dev, DeviceAPI* device); + + protected: + struct Entry { + void* data; + size_t x; + size_t y; + DLDataType type; + }; + std::vector free_list_; + std::vector allocated_; +}; + /*! * \brief A two dimensional storage pool that recycles temporal workspace * allocations for dynamically allocated texture. See AllocTexture docstring @@ -136,9 +155,8 @@ class TVM_DLL TexturePool { void FreeTexture(Device dev, void* ptr); private: - class Pool; /*! \brief pool of device local array */ - std::vector array_; + std::vector array_; /*! \brief device type this pool support */ DLDeviceType device_type_; /*! 
\brief The device API */ diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index e6f322885e3a2..4a969dcee8bb9 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -43,6 +43,10 @@ #define TVM_INFO_USE_OPENCL "NOT-FOUND" #endif +#ifndef TVM_INFO_USE_OPENCL_GTEST +#define TVM_INFO_USE_OPENCL_GTEST "NOT-FOUND" +#endif + #ifndef TVM_INFO_USE_VULKAN #define TVM_INFO_USE_VULKAN "NOT-FOUND" #endif @@ -286,6 +290,7 @@ TVM_DLL Map GetLibInfo() { {"USE_MSVC_MT", TVM_INFO_USE_MSVC_MT}, {"USE_NNPACK", TVM_INFO_USE_NNPACK}, {"USE_OPENCL", TVM_INFO_USE_OPENCL}, + {"USE_OPENCL_GTEST", TVM_INFO_USE_OPENCL_GTEST}, {"USE_OPENMP", TVM_INFO_USE_OPENMP}, {"USE_PAPI", TVM_INFO_USE_PAPI}, {"USE_PROFILER", TVM_INFO_USE_PROFILER}, diff --git a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc b/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc new file mode 100644 index 0000000000000..2d3f43ddce6de --- /dev/null +++ b/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" +#include "../src/runtime/texture.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::cl; + +// PoolWrapper is necessary because in class Pool2D we don't have an access to +// its protected members. 
In this class we add new methods which allow us to +// get and check internal state of class Pool +class PoolWrapper : public Pool2D { + public: + inline size_t FreeListSize() const { return free_list_.size(); } + inline size_t AllocatedListSize() const { return allocated_.size(); } + inline std::pair FreeListItemSize(size_t idx) const { + return std::make_pair(free_list_[idx].x, free_list_[idx].y); + } + inline std::pair AllocatedListItemSize(size_t idx) const { + return std::make_pair(allocated_[idx].x, allocated_[idx].y); + } +}; + +TEST(OpenCLTexturePool, textures_reallocation_optimal_size) { + OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = workspace->GetThreadEntry(); + PoolWrapper pool; + EXPECT_EQ(pool.AllocatedListSize(), 0); + EXPECT_EQ(pool.FreeListSize(), 0); + + DLDataType type{kDLFloat, 16, 1}; + void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 0); + auto item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 768); + + pool.Alloc(t->device, workspace, 64, 12455, type); + EXPECT_EQ(pool.AllocatedListSize(), 2); + EXPECT_EQ(pool.FreeListSize(), 0); + item = pool.AllocatedListItemSize(1); + EXPECT_EQ(item.first, 64); + EXPECT_EQ(item.second, 12455); + + pool.Free(data1); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 1); + item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 64); + EXPECT_EQ(item.second, 12455); + item = pool.FreeListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 768); + + pool.Alloc(t->device, workspace, 768, 1024, type); + EXPECT_EQ(pool.AllocatedListSize(), 2); + EXPECT_EQ(pool.FreeListSize(), 0); + item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 64); + EXPECT_EQ(item.second, 12455); + item = pool.AllocatedListItemSize(1); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 1024); +} + +TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) { + OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = workspace->GetThreadEntry(); + PoolWrapper pool; + EXPECT_EQ(pool.AllocatedListSize(), 0); + EXPECT_EQ(pool.FreeListSize(), 0); + + DLDataType type{kDLFloat, 16, 1}; + void* data1 = pool.Alloc(t->device, workspace, 12455, 64, type); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 0); + auto item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 12455); + EXPECT_EQ(item.second, 64); + + pool.Free(data1); + EXPECT_EQ(pool.AllocatedListSize(), 0); + EXPECT_EQ(pool.FreeListSize(), 1); + item = pool.FreeListItemSize(0); + EXPECT_EQ(item.first, 12455); + EXPECT_EQ(item.second, 64); + + pool.Alloc(t->device, workspace, 1024, 768, type); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 1); + item = pool.FreeListItemSize(0); + EXPECT_EQ(item.first, 12455); + EXPECT_EQ(item.second, 64); + item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 768); +} + +TEST(OpenCLTexturePool, avoid_reusing_too_small_textures) { + OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = workspace->GetThreadEntry(); + PoolWrapper pool; + EXPECT_EQ(pool.AllocatedListSize(), 0); + EXPECT_EQ(pool.FreeListSize(), 0); + + DLDataType type{kDLFloat, 16, 1}; + void* data1 = pool.Alloc(t->device, workspace, 1024, 64, type); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 0); + auto item = 
pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 64); + + pool.Free(data1); + EXPECT_EQ(pool.AllocatedListSize(), 0); + EXPECT_EQ(pool.FreeListSize(), 1); + item = pool.FreeListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 64); + + pool.Alloc(t->device, workspace, 12544, 64, type); + EXPECT_EQ(pool.AllocatedListSize(), 1); + EXPECT_EQ(pool.FreeListSize(), 1); + item = pool.FreeListItemSize(0); + EXPECT_EQ(item.first, 1024); + EXPECT_EQ(item.second, 64); + item = pool.AllocatedListItemSize(0); + EXPECT_EQ(item.first, 12544); + EXPECT_EQ(item.second, 64); +} diff --git a/tests/cpp-runtime/opencl/run_gtests.cc b/tests/cpp-runtime/opencl/run_gtests.cc new file mode 100644 index 0000000000000..b16ae3efc74d9 --- /dev/null +++ b/tests/cpp-runtime/opencl/run_gtests.cc @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include +#include + +#include "../src/support/utils.h" + +namespace tvm { +namespace runtime { +namespace cl { + +TVM_REGISTER_GLOBAL("opencl.run_gtests").set_body([](TVMArgs args, TVMRetValue* rv) { + // gtest args are passed into this packed func as a singular string + // split gtest args using delimiter and build argument vector + std::vector parsed_args = tvm::support::Split(args[0], ' '); + std::vector argv; + + // add executable name + argv.push_back(const_cast("opencl_run_gtests")); + + // add parsed arguments + for (int i = 0; i < parsed_args.size(); ++i) { + argv.push_back(const_cast(parsed_args[i].data())); + } + + // end of parsed arguments + argv.push_back(nullptr); + + // set argument count + int argc = argv.size() - 1; + + // initialize gtest with arguments and run + ::testing::InitGoogleTest(&argc, argv.data()); + *rv = RUN_ALL_TESTS(); +}); + +} // namespace cl +} // namespace runtime +} // namespace tvm diff --git a/tests/python/contrib/test_opencl/conftest.py b/tests/python/contrib/test_opencl/conftest.py new file mode 100644 index 0000000000000..0a8b9e1c631f0 --- /dev/null +++ b/tests/python/contrib/test_opencl/conftest.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" OpenCL testing fixtures used to deduce testing argument + values from testing parameters """ + + +import pytest + +import tvm +import tvm.testing + +pytest_plugins = [ + "tvm.contrib.hexagon.pytest_plugin", +] diff --git a/tests/python/contrib/test_opencl/test_run_gtests.py b/tests/python/contrib/test_opencl/test_run_gtests.py new file mode 100644 index 0000000000000..4afcf7ee8d660 --- /dev/null +++ b/tests/python/contrib/test_opencl/test_run_gtests.py @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import pytest +import numpy as np + +import tvm +from tvm import rpc + + +# use pytest -sv to observe gtest output +# use --gtest_args to pass arguments to gtest +# for example to run all "foo" tests twice and observe gtest output run +# pytest -sv --gtests_args="--gtest_filter=*foo* --gtest_repeat=2" +@tvm.testing.requires_opencl +def test_run_gtests(gtest_args): + if ( + "TVM_TRACKER_HOST" in os.environ + and "TVM_TRACKER_PORT" in os.environ + and "TVM_TRACKER_KEY" in os.environ + ): + rpc_tracker_host = os.environ["TVM_TRACKER_HOST"] + rpc_tracker_port = os.environ["TVM_TRACKER_PORT"] + rpc_tracker_port = int(rpc_tracker_port) + rpc_key = os.environ["TVM_TRACKER_KEY"] + tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port) + rpc_connection = tracker.request(rpc_key, priority=0, session_timeout=600) + else: + rpc_connection = rpc.LocalSession() + + try: + func = rpc_connection.get_function("opencl.run_gtests") + except: + print( + "This test requires TVM Runtime to be built with a OpenCL gtest version using OpenCL API cmake flag -DUSE_OPENCL_GTEST=/path/to/opencl/googletest/gtest" + ) + raise + + gtest_error_code = func(gtest_args) + np.testing.assert_equal(gtest_error_code, 0) diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py index d36da51c8f713..89f68dacbd3ff 100644 --- a/tests/python/relay/test_conv2d_nchw_texture.py +++ b/tests/python/relay/test_conv2d_nchw_texture.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+import re import tvm import numpy as np from tvm import relay @@ -392,3 +393,45 @@ def test_conv2d_yolov3_v2_nchw_3c(): } build_run_compare(mod, params, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_vgg16_winograd_4d(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 512, 28, 28) + filter_shape = (512, 512, 3, 3) + bias_shape = (1, 512, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[1, 1, 1, 1], + channels=512, + kernel_size=[3, 3], + out_dtype=dtype, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + matches = re.findall("winograd", graph) + assert len(matches) > 0 diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/test_conv2d_nhwc_texture.py index a02b7cabbef62..96227ca551cf9 100644 --- a/tests/python/relay/test_conv2d_nhwc_texture.py +++ b/tests/python/relay/test_conv2d_nhwc_texture.py @@ -16,6 +16,7 @@ # under the License. import os +import re import tvm import numpy as np from tvm import relay @@ -554,3 +555,45 @@ def test_conv2d_yolov3_v2_nhwc_3c(): } build_run_compare(mod, params, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_vgg16_winograd_4d(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 28, 28, 512) + filter_shape = (3, 3, 512, 512) + bias_shape = (1, 1, 1, 512) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[1, 1, 1, 1], + channels=512, + kernel_size=[3, 3], + out_dtype=dtype, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + matches = re.findall("winograd", graph) + assert len(matches) > 0 diff --git a/tests/python/relay/utils/adreno_utils.py b/tests/python/relay/utils/adreno_utils.py index 11abce3bfaa0a..3bb4a6ada4ecc 100644 --- a/tests/python/relay/utils/adreno_utils.py +++ b/tests/python/relay/utils/adreno_utils.py @@ -105,6 +105,7 @@ def build_run_compare( # print(index, output[index], x) np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1) + return graph def gpu_preprocess(tvm_mod): From 236eea0f49b4ca9a30e99d54f2ceb7ee3ef836f7 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: 
Thu, 9 Jun 2022 10:19:31 +0100 Subject: [PATCH 076/181] [CMSIS-NN] Removed redudant arguments to CMSIS-NN wrapper function (#11431) Removed input_scale and filter_scale from CMSIS-NN wrapper function. These are not needed by CMSIS-NN API which gets called from the generated C wrapper function for Conv2D. --- .../backend/contrib/cmsisnn/relay_to_tir.cc | 29 +++++- .../contrib/test_cmsisnn/test_conv2d.py | 96 ++++++++++++++++++- 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index dc5537ee905d8..524735caa9d6a 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -141,18 +141,24 @@ class RelayToTIRVisitor : public MixedModeMutator { // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5, // %output_scale_scalar, %output_zero_point_scalar) // clip(%3, a_min=%min_scalar, a_max=%max_scalar) + // Position of scales in the global function for Conv2D + const int filter_scale_pos = 3; + const int input_scale_pos = bias_add_call ? 5 : 4; BufferCreator buffer_creator; tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(8)); tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(8)); tir::Var multiplier = buffer_creator.CreateBufferVar("multiplier", DataType::Handle(32)); - tir::Var filter_scale = buffer_creator.CreateBufferVar("filter_scale", DataType::Handle(32)); if (bias_add_call) { buffer_creator.CreateBufferVar("bias", DataType::Handle(32)); } - tir::Var input_scale = buffer_creator.CreateBufferVar("input_scale", DataType::Handle(32)); tir::Var shift = buffer_creator.CreateBufferVar("shift", DataType::Handle(32)); tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8)); + // Relay function contains input_scale and filter_scale as function parameters at the following + // locations in the global partitioned function for Conv2D + skip_call_args_.insert(filter_scale_pos); + skip_call_args_.insert(input_scale_pos); + // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50 @@ -742,11 +748,25 @@ class RelayToTIRVisitor : public MixedModeMutator { GetRef(func)); } + // Drop out the redundant arguments, and the arg_types from the global function call Array args; + Array arg_types; + auto* func_type = new_global_var->checked_type_.as(); + int arg_id = -1; for (const auto& arg : call->args) { + ++arg_id; + if (std::find(skip_call_args_.begin(), skip_call_args_.end(), arg_id) != + skip_call_args_.end()) { + continue; + } args.push_back(VisitExpr(arg)); + arg_types.push_back(func_type->arg_types[arg_id]); } - + if (arg_types.size() != func_type->arg_types.size()) { + new_global_var->checked_type_ = + FuncType(arg_types, func_type->ret_type, {}, func_type->type_constraints); + } + skip_call_args_.clear(); return Call(new_global_var, args, call->attrs, call->type_args, call->span); } } @@ -757,7 +777,10 @@ class RelayToTIRVisitor : public MixedModeMutator { static constexpr int32_t kScaledDiffIntegerBits = 5; static constexpr int32_t kInputBits = 5; static constexpr double kBeta = 1.0; + /*! \brief Unique id for context buffer needed by CMSIS-NN layers. */ int32_t context_buffer_id_; + /*! 
\brief Skip arguments in the call to global partitioned function. */ + std::unordered_set skip_call_args_; IRModule ir_module_; Target target_; }; diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 439a3ec39c9a7..90261e540a7d6 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -23,7 +23,7 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_models, compile_and_run from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from utils import ( @@ -119,6 +119,100 @@ def make_model( return last_op, params +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("padding", ["SAME", "VALID"]) +@pytest.mark.parametrize("enable_bias", [True, False]) +@pytest.mark.parametrize( + "input_zero_point, input_scale, kernel_scale, out_channels", + [(10, 0.0128, [0.11, 0.22], 2)], +) +def test_conv2d_number_primfunc_args( + padding, + enable_bias, + input_zero_point, + input_scale, + kernel_scale, + out_channels, +): + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + ifm_shape = (1, 64, 100, 4) + kernel_size = (3, 3) + strides = (1, 1) + dilation = (1, 1) + dtype = "int8" + groups = 1 + weight_format = "HWIO" + kernel_h = kernel_size[0] + kernel_w = kernel_size[1] + kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) + kernel_zero_point = 0 + in_min, in_max = get_range_for_dtype_str(dtype) + relu_type = "RELU" + + output_scale, output_zero_point = get_conv2d_qnn_params( + kernel_shape, + input_scale, + input_zero_point, + kernel_scale, + kernel_zero_point, + dtype, + dtype, + dtype, + ) + + model, params = make_model( + ifm_shape, + kernel_shape, + input_zero_point, + input_scale, + kernel_zero_point, + kernel_scale, + output_zero_point, + output_scale, + padding, + strides, + dilation, + groups, + dtype, + dtype, + out_channels, + weight_format, + enable_bias, + relu_type, + ) + orig_mod = make_module(model) + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod) + + # compile the model + rng = np.random.default_rng(12345) + inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + + compiled_models = compile_models( + AOTTestModel(module=cmsisnn_mod, inputs=inputs, outputs=output_list, params=params), + interface_api, + use_unpacked_api, + ) + + # validate number of TIR primfunc args + expected_num_params = 6 if enable_bias else 5 + cmsisnn_tir_mod = None + for target, mod in compiled_models[0].executor_factory.lowered_ir_mods.items(): + if "cmsis-nn" == target.kind.name: + cmsisnn_tir_mod = mod + + cmsisnn_func = cmsisnn_tir_mod["tvmgen_default_cmsis_nn_main_0"] + assert ( + len(cmsisnn_func.params) == expected_num_params + ), "Generated unexpected number of function arguments" + + @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @pytest.mark.parametrize("relu_type", ["RELU"]) From d8678a6a9aa7962b658efb603e27d83ea7737a02 Mon Sep 17 00:00:00 2001 From: FranckQC <89943638+FranckQC@users.noreply.github.com> Date: Thu, 9 Jun 2022 11:32:15 -0500 Subject: [PATCH 077/181] [TIR] CSE pass : Restrict the equivalence to 
be decided by a normal form - avoids comparison of terms (#11574)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CSE pass had been designed to potentially allow comparisons (and commonings) of equivalent terms (like (x+y)+z and x+(y+z)), where **the notion of being equivalent was customizable, and no assumption was made about it**. That meant that the implementation of the equivalence test function `EquivalentTerms()` - which at that point just called the syntactical equality test `EqualTerms()` - could later be replaced by a cleverer equality test.

However, having such a generic way of comparing elements meant that in the function `SyntacticToSemanticComputations()`, where we go from a hashtable of syntactical entities to what I called a vector of "semantical entities" (which are just canonical forms/representatives of classes of equivalence of terms), **the only way was to compare each pair**. That resulted in a quadratic behavior of this function, and there was no way around it, since merging equivalent entities into their class of equivalence requires comparing them.

**This PR essentially does the following:**

- When computing the classes of equivalence of terms (i.e. transforming a ComputationTable, which is a hashtable, into a vector of classes of equivalence): **instead of comparing each pair of terms, it relies on a normalization procedure to obtain a normal form for each of them**. That turns a small part of the algorithm that was quadratic into an n.log(n) one. However, it is difficult to see improvements in practice, in particular for average-sized programs, as that part was a "small" quadratic next to a "big" n.log(n) (finding things in a hashtable, copying them into a vector, etc.). It probably goes from a complexity of ~O(((n²-n)/2) + n.log(n)) to ~O(3n + n.log(n)), so gains would only be expected for very large programs.

- Completely exposes the possibility to turn the semantical comparison of terms ON or OFF. It is OFF by default (compilation is noticeably longer with it ON, unsurprisingly), which means that by default the equivalence coincides with the (syntactical) equality of terms. As the pass was written with the possibility of doing these additional commonings (like (x+y)+z and x+(y+z)), it was a good time to plug that in completely, up to the Python user, who can now turn it ON when desired. Again, it is OFF by default, so there is no real change to the default behavior. To turn it ON, simply do `with tvm.transform.PassContext(config={'tir.enable_equiv_terms_in_cse_tir':True}):` before calling `build()`.

- When this boolean is ON, the pass uses a simple implementation of the normalization function that relies on `arith::Analyzer::Simplify`, as noted in https://github.com/apache/tvm/pull/10544 (a short illustrative sketch is included further below). Note that this is not a true normalization procedure, as it is incomplete (i.e. it is not guaranteed to converge to the normal form), but it is correct, and it handles the most useful properties: associativity of +, distributivity of * over +, etc.

- Clarifies and enhances the test base for the pass. In particular, it adds the tests that were written in https://github.com/apache/tvm/pull/10544 but which did not make it through.

- Also add the test ( https://github.com/AndrewZhaoLuo/TVM-Sandbox/blob/19284ddbd6bb28af61c0c2aa8bb334c5c53731a7/tir/test_inconsistent_tir_lowering.py#L1 ) demonstrating the (older) non-deterministic lowering and put it into a proper test, as I found it useful for making sure that this does not happen again. It has been copied from https://github.com/apache/tvm/pull/10663 and only slightly adapted (in particular for doing the comparison of hashes automatically instead of printing them and relying on a human to compare them). --- include/tvm/tir/transform.h | 3 +- python/tvm/tir/transform/transform.py | 4 +- src/driver/driver_api.cc | 6 +- src/tir/transforms/common_subexpr_elim.cc | 96 +++++-- src/tir/transforms/common_subexpr_elim.h | 8 +- .../transforms/common_subexpr_elim_tools.cc | 145 +++++++--- .../transforms/common_subexpr_elim_tools.h | 10 +- .../test_tir_transform_common_subexpr_elim.py | 260 ++++++++++++++---- 8 files changed, 409 insertions(+), 123 deletions(-) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 24c3cfa78f721..4612d5ad3feac 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -470,9 +470,10 @@ TVM_DLL Pass LowerVtcmAlloc(); * \brief Implements a Common Subexpression Elimination (CSE) for TIR * which introduces let-in bindings for duplicated sub-expressions. * \param enable_cse_tir Whether common subexpression elimination is enabled. + * \param identify_equiv_terms Whether equivalent terms should be identified. * \return The pass. */ -TVM_DLL Pass CommonSubexprElimTIR(bool enable_cse_tir = true); +TVM_DLL Pass CommonSubexprElimTIR(bool enable_cse_tir = true, bool identify_equiv_terms = false); /*! * \brief Unify all the thread bindings for "blockIdx.x/y/z", "threadIdx.x/y/z", and diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 802fdc576c41f..1bed29c560fc9 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -324,7 +324,7 @@ def BF16TypeLowering(): return _ffi_api.BF16TypeLowering() # type: ignore -def CommonSubexprElimTIR(enable_cse_tir: bool = True): +def CommonSubexprElimTIR(enable_cse_tir: bool = True, identify_equiv_terms: bool = False): """Replace redundant computations by new variables. 
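A minimal editorial sketch of the normalization idea described in the commit message above: two syntactically different but equivalent terms are reduced to the same normal form by the arithmetic analyzer, so they can be grouped without pairwise comparisons. The variable names are illustrative only.

    import tvm
    from tvm import te, arith

    x, y, z = te.var("x"), te.var("y"), te.var("z")
    ana = arith.Analyzer()
    lhs = ana.simplify((x + y) + z)  # associativity: both spellings normalize
    rhs = ana.simplify(x + (y + z))  # to the same canonical sum
    assert tvm.ir.structural_equal(lhs, rhs)

    # The equivalence-based commoning itself stays opt-in:
    # with tvm.transform.PassContext(config={"tir.enable_equiv_terms_in_cse_tir": True}):
    #     tvm.build(...)  # CSE then identifies terms up to this normal form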
Returns @@ -332,7 +332,7 @@ def CommonSubexprElimTIR(enable_cse_tir: bool = True): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.CommonSubexprElimTIR(enable_cse_tir) # type: ignore + return _ffi_api.CommonSubexprElimTIR(enable_cse_tir, identify_equiv_terms) # type: ignore def RewriteUnsafeSelect(): diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7df1a844acc2b..7706f229c9ed3 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -45,6 +45,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_bound_checkers", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_assert", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_vectorize", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_cse_tir", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.enable_equiv_terms_in_cse_tir", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); @@ -198,6 +199,8 @@ Array CreatePassList(bool disable_loop_partition) { bool instrument_bound_checkers = pass_ctx->GetConfig("tir.instrument_bound_checkers", Bool(false)).value(); bool disable_cse_tir = pass_ctx->GetConfig("tir.disable_cse_tir", Bool(false)).value(); + bool enable_equiv_terms_in_cse_tir = + pass_ctx->GetConfig("tir.enable_equiv_terms_in_cse_tir", Bool(false)).value(); // Get any user-added passes Array> add_lower_pass = @@ -289,7 +292,8 @@ Array CreatePassList(bool disable_loop_partition) { pass_list.push_back(tir::transform::InstrumentBoundCheckers()); } - pass_list.push_back(tir::transform::CommonSubexprElimTIR(!disable_cse_tir)); + pass_list.push_back( + tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir)); return pass_list; } diff --git a/src/tir/transforms/common_subexpr_elim.cc b/src/tir/transforms/common_subexpr_elim.cc index d43b30d17be00..290f920e3fc07 100644 --- a/src/tir/transforms/common_subexpr_elim.cc +++ b/src/tir/transforms/common_subexpr_elim.cc @@ -60,7 +60,7 @@ namespace tir { to collect them for the CSE pass, but we also won't even want to collect computations that contain them. The reason is that reusing such computations would change the semantics of the program, - and therefore before doing any introduction of variable or any reuse of already introduced + and therefore before doing any introduction of var or any reuse of already introduced variables, we will make sure that the computation being considered is not forbidden, and that it does not even contain a forbidden computation. * \param expr The expression to check @@ -120,6 +120,42 @@ bool CommonSubexpressionEliminator::CanContainEligibleComputations(const PrimExp return true; } +/*! + * \brief Implements an order on pairs (expression,frequency). First attempts to compare them + using the size of the expression. If it is the same, decides something else still + deterministic. + * \param a The first pair + * \param b The second pair + * \return A boolean telling if the first pair `a` comes before the second pair `b` + * \note We need this order to be deterministic in order to have a fully deterministic pass, + * as we will deal with elements that are coming from a hashtable, but the order in which + * they appeared in the hashtable was based on some runtime addresses, so it can potentially + * change with every execution. 
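+ *       For instance, the pair ((x*y) + z, 2) is ordered before (x + 1, 5): the first
+ *       expression is syntactically larger, and the frequency component of the pairs is
+ *       never consulted by this ordering.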
+ */ +bool CommonSubexpressionEliminator::OrderOnExprAndFrequency(std::pair a, + std::pair b) { + size_t a_size = CalculateExprComplexity(a.first); + size_t b_size = CalculateExprComplexity(b.first); + + // Criteria 1 - Size of the expression comes first + // `a` comes before `b` if the size of `a` is bigger + if (a_size > b_size) { + return true; + } + // `a` does NOT come before `b` if the size of `b` is bigger + if (b_size > a_size) { + return false; + } + + // Criteria 2 - If they had the same size, use the lexicographic order as a last resort + // as we need a deterministic order + std::stringstream a_stream; + std::stringstream b_stream; + a_stream << a.first; + b_stream << b.first; + return (a_stream.str().compare(b_stream.str()) < 0); +} + /*! * \brief Generates a new fresh variable, whose name will be cse_var_i. * \param type_annotation The type of the new variable to generate @@ -166,10 +202,12 @@ int CommonSubexpressionEliminator::GetNbVarGenerated() { return nb_var_; } of the function being analyzed * \return A new statement where CSE has been performed */ -Stmt CommonSubexpressionEliminator::PerformCSE(const Stmt& stmt, const Context& context_init) { +Stmt CommonSubexpressionEliminator::PerformCSE(const Stmt& stmt, const Context& context_init, + bool identify_equiv_terms) { // As this function is being called for each PrimFunc definition, we create a new instance // for the one we are having now. - CommonSubexpressionEliminator common_subexpression_eliminator(stmt, context_init); + CommonSubexpressionEliminator common_subexpression_eliminator(stmt, context_init, + identify_equiv_terms); return common_subexpression_eliminator.VisitStmt(stmt); } @@ -179,8 +217,9 @@ Stmt CommonSubexpressionEliminator::PerformCSE(const Stmt& stmt, const Context& formal parameters of the function that will be analyzed */ CommonSubexpressionEliminator::CommonSubexpressionEliminator(const Stmt& stmt, - const Context& context_init) - : initial_body_(stmt), context_(context_init) {} + const Context& context_init, + bool identify_equiv_terms) + : initial_body_(stmt), context_(context_init), identify_equiv_terms_(identify_equiv_terms) {} /*! * \brief The method which overrides the generic dispatcher of StmtExprMutator. @@ -200,28 +239,28 @@ PrimExpr CommonSubexpressionEliminator::VisitExpr(const PrimExpr& expr) { // Transform the hashtable of *syntactic* eligible computations into a vector of pairs // containing *semantic* entities, i.e. where equivalent computations are merged. 
std::vector> semantic_comp_done_by_expr = - SyntacticToSemanticComputations(table_syntactic_comp_done_by_expr); + SyntacticToSemanticComputations(table_syntactic_comp_done_by_expr, identify_equiv_terms_); // Sort the vector of semantic entities by decreasing size std::sort(semantic_comp_done_by_expr.begin(), semantic_comp_done_by_expr.end(), - [](std::pair a, std::pair b) { - return (CalculateExprComplexity(a.first) > CalculateExprComplexity(b.first)); - }); + OrderOnExprAndFrequency); // For each computation done (considering them from biggest to smallest) for (size_t i = 0; i < semantic_comp_done_by_expr.size(); i++) { std::pair& computation_and_nb = semantic_comp_done_by_expr[i]; + bool ident_equiv_terms = identify_equiv_terms_; // To avoid the capture of "this" + // The predicate later used (when doing replacements) to select expressions that are // equivalent to the current computation (`computation_and_nb.first`) std::function predicate_selector = - [computation_and_nb](const PrimExpr& current_expr) { + [computation_and_nb, ident_equiv_terms](const PrimExpr& current_expr) { // `current_expr` should be equivalent to `computation_and_nb.first`, but we also check // that `current_expr` is an eligible computation even if we know that // `computation_and_nb.first` is eligible by construction, in case that one day the // equivalence relation would not preserve the eligibility any more (even though that // would probably be a very weird equivalence). - return (EquivalentTerms(current_expr, computation_and_nb.first) && + return (EquivalentTerms(current_expr, computation_and_nb.first, ident_equiv_terms) && IsEligibleComputation(current_expr)); }; @@ -229,10 +268,11 @@ PrimExpr CommonSubexpressionEliminator::VisitExpr(const PrimExpr& expr) { // equivalent to `computation_and_nb.first` auto it_on_var = std::find_if( context_.begin(), context_.end(), - [computation_and_nb](const std::pair& var_and_value) { + [computation_and_nb, ident_equiv_terms](const std::pair& var_and_value) { // Note : safe to call value() as we check has_value() just before return (var_and_value.second.has_value() && - EquivalentTerms(var_and_value.second.value(), computation_and_nb.first)); + EquivalentTerms(var_and_value.second.value(), computation_and_nb.first, + ident_equiv_terms)); }); // Case where we have a perfectly equivalent computation already available in a variable @@ -298,7 +338,8 @@ PrimExpr CommonSubexpressionEliminator::VisitExpr(const PrimExpr& expr) { // The following insertion will maintain `semantic_comp_done_by_expr` sorted (by // decreasing size/complexity), and it will only insert at locations > i as the // direct subexprs are necessarily smaller than the current computation. - InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_expr, direct_subexprs); + InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_expr, direct_subexprs, + identify_equiv_terms_); } } // Note : we do not remove the current element, as we never look back in the local vector @@ -378,28 +419,28 @@ Stmt CommonSubexpressionEliminator::VisitStmt(const Stmt& stmt) { // Transform the hashtable of *syntactic* eligible computations into a vector of pairs // containing *semantic* entities, i.e. where equivalent computations are merged. 
std::vector> semantic_comp_done_by_stmt = - SyntacticToSemanticComputations(table_syntactic_comp_done_by_stmt); + SyntacticToSemanticComputations(table_syntactic_comp_done_by_stmt, identify_equiv_terms_); // Sort the vector of semantic entities by decreasing size std::sort(semantic_comp_done_by_stmt.begin(), semantic_comp_done_by_stmt.end(), - [](std::pair a, std::pair b) { - return (CalculateExprComplexity(a.first) > CalculateExprComplexity(b.first)); - }); + OrderOnExprAndFrequency); // For each computation done (considering them from biggest to smallest) for (size_t i = 0; i < semantic_comp_done_by_stmt.size(); i++) { std::pair& computation_and_nb = semantic_comp_done_by_stmt[i]; + bool ident_equiv_terms = identify_equiv_terms_; // To avoid the capture of "this" + // The predicate later used (when doing replacements) to select expressions that are // equivalent to the current computation (`computation_and_nb.first`) std::function predicate_selector = - [computation_and_nb](const PrimExpr& current_expr) { + [computation_and_nb, ident_equiv_terms](const PrimExpr& current_expr) { // `current_expr` should be equivalent to `computation_and_nb.first`, but we also check // that `current_expr` is an eligible computation even if we know that // `computation_and_nb.first` is eligible by construction, in case that one day the // equivalence relation would not preserve the eligibility any more (even though that // would probably be a very weird equivalence). - return (EquivalentTerms(current_expr, computation_and_nb.first) && + return (EquivalentTerms(current_expr, computation_and_nb.first, ident_equiv_terms) && IsEligibleComputation(current_expr)); }; @@ -407,10 +448,11 @@ Stmt CommonSubexpressionEliminator::VisitStmt(const Stmt& stmt) { // equivalent to `computation_and_nb.first` auto it_on_var = std::find_if( context_.begin(), context_.end(), - [computation_and_nb](const std::pair& var_and_value) { + [computation_and_nb, ident_equiv_terms](const std::pair& var_and_value) { // Note : safe to call value() as we check has_value() just before return (var_and_value.second.has_value() && - EquivalentTerms(var_and_value.second.value(), computation_and_nb.first)); + EquivalentTerms(var_and_value.second.value(), computation_and_nb.first, + ident_equiv_terms)); }); // Case where we have a perfectly equivalent computation already available in a variable @@ -477,7 +519,8 @@ Stmt CommonSubexpressionEliminator::VisitStmt(const Stmt& stmt) { // The following insertion will maintain `semantic_comp_done_by_stmt` sorted (by // decreasing size/complexity), and it will only insert at locations > i as the // direct subexprs are necessarily smaller than the current computation. - InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_stmt, direct_subexprs); + InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_stmt, direct_subexprs, + identify_equiv_terms_); } } // Note : we do not remove the current element, as we never look back in the local vector @@ -587,8 +630,8 @@ namespace transform { * \brief The function which returns the pass for the Common Subexpression Elimination. * \return The pass for performing CSE. 
*/ -Pass CommonSubexprElimTIR(bool enable_cse_tir) { - auto pass_func = [enable_cse_tir](PrimFunc f, IRModule m, PassContext ctx) { +Pass CommonSubexprElimTIR(bool enable_cse_tir, bool identify_equiv_terms) { + auto pass_func = [enable_cse_tir, identify_equiv_terms](PrimFunc f, IRModule m, PassContext ctx) { if (enable_cse_tir) { auto* n = f.CopyOnWrite(); Context context_init; @@ -603,7 +646,8 @@ Pass CommonSubexprElimTIR(bool enable_cse_tir) { // Do the Common Subexpression Elimination on the body of the function, with the initial // context that we have prepared - n->body = CommonSubexpressionEliminator::PerformCSE(std::move(f->body), context_init); + n->body = CommonSubexpressionEliminator::PerformCSE(std::move(f->body), context_init, + identify_equiv_terms); } return f; diff --git a/src/tir/transforms/common_subexpr_elim.h b/src/tir/transforms/common_subexpr_elim.h index 484d93c769822..5c14caf1a6e36 100644 --- a/src/tir/transforms/common_subexpr_elim.h +++ b/src/tir/transforms/common_subexpr_elim.h @@ -55,7 +55,7 @@ using Context = std::vector>; class CommonSubexpressionEliminator : public StmtExprMutator { public: // Toplevel (static) function - static Stmt PerformCSE(const Stmt& stmt, const Context& context_init); + static Stmt PerformCSE(const Stmt& stmt, const Context& context_init, bool identify_equiv_terms); PrimExpr VisitExpr(const PrimExpr& expr) override; Stmt VisitStmt(const Stmt& stmt) override; @@ -64,7 +64,8 @@ class CommonSubexpressionEliminator : public StmtExprMutator { protected: // Constructor - CommonSubexpressionEliminator(const Stmt& stmt, const Context& context_init); + CommonSubexpressionEliminator(const Stmt& stmt, const Context& context_init, + bool identify_equiv_terms); PrimExpr VisitExpr_(const LetNode* op) override; @@ -77,9 +78,12 @@ class CommonSubexpressionEliminator : public StmtExprMutator { int num_last_try_ = 0; // Number of the last variable tried int nb_var_ = 0; // Number of variables introduced by the CSE pass + bool identify_equiv_terms_ = false; + static bool ForbiddenComputation(const PrimExpr& expr); static bool IsEligibleComputation(const PrimExpr& expr); static bool CanContainEligibleComputations(const PrimExpr& expr); + static bool OrderOnExprAndFrequency(std::pair a, std::pair b); Var GenerateNewVar(DataType type_annotation); }; diff --git a/src/tir/transforms/common_subexpr_elim_tools.cc b/src/tir/transforms/common_subexpr_elim_tools.cc index d39d211ba1824..b5b1bfccdf4ac 100644 --- a/src/tir/transforms/common_subexpr_elim_tools.cc +++ b/src/tir/transforms/common_subexpr_elim_tools.cc @@ -25,7 +25,8 @@ #include "common_subexpr_elim_tools.h" -#include // For the class Pass and the class PassContext +#include // For the arith::Analyzer::Simplify() method simplifying terms +#include // For the class Pass and the class PassContext #include #include // For the ExprDeepEqual analysis #include @@ -720,14 +721,42 @@ bool EqualTerms(const PrimExpr& a, const PrimExpr& b) { return deep_equal_(a, b); } +/*! + * \brief Normalization function of a term, use to decide the equivalence relation of interest + * \param expr The expression to normalize + * \param do_normalization Whether we want the function to actually do normalization + * \note This function can be customized + */ +PrimExpr NormalizeTerm(const PrimExpr& expr, bool do_normalization) { + if (do_normalization) { + // Customize here! 
+ // We could decide to normalize terms in a way that identifies them modulo commutativity + // (like x+y and y+x), or modulo associativity (like (x+y)+z and x+(y+z)), etc. + // For that, a normalization procedure (or an incomplete "pseudo-normalization" like + // arith::Analyzer::Simplify) will be used. + + // One possible customization: + // Here is just an attempt to do more commonings by using the pseudo-normalization function + // offered by arith::Analyzer::Simplify(). "pseudo" because while it is correct (i.e. + // the simplification is indeed equivalent to the original term), it is incomplete (i.e. + // the returned term is not guaranteed to be a normal form). + arith::Analyzer analyzer; + return analyzer.Simplify(expr); + } else { + // If `do_normalization` is false, the equivalence relation just checks the syntactic equality, + // so the normalization is just the identity function. + return expr; + } +} + /*! * \brief Decides if two terms are equivalent semantically */ -bool EquivalentTerms(const PrimExpr& a, const PrimExpr& b) { - // For now, we just check the syntactic equality, but that could later become a semantic test, - // for instance identifying computations modulo commutativity (like x+y and y+x), or modulo - // associativity (like (x+y)+z and x+(y+z)), etc. - return EqualTerms(a, b); +bool EquivalentTerms(const PrimExpr& a, const PrimExpr& b, bool identify_equiv_terms) { + // We restrict the equivalence to be decidable by a normalization procedure that is used to + // normalize both sides, and to then compare the normal forms with the strict syntactical + // equality + return EqualTerms(NormalizeTerm(a, identify_equiv_terms), NormalizeTerm(b, identify_equiv_terms)); } /*! @@ -739,21 +768,52 @@ bool EquivalentTerms(const PrimExpr& a, const PrimExpr& b) { \note This function is needed because the advantage of the hashtable was the constant lookup. But in order to have this constant lookup, we could not collapse semantically equivalent computations. + Attention, the pairs returned are deterministic and will always be the same (as the same + canonical representant will always be chosen for a given class of equivalence), but the + order in which these pairs appear in the result is not deterministic, as it is based on + the order in which we found items in the "normalized hashtable" `norm_table`). The caller + is expected to sort the result anyway. */ std::vector> SyntacticToSemanticComputations( - const ComputationTable& table) { + const ComputationTable& table, bool identify_equiv_terms) { std::vector> result; - // table.size() is an upper-bound of the number of elements in the resulting vector, - // as we might merge semantically equivalent computations. - // We do this reservation even if it might reserve slightly more space than is needed in the end - result.reserve(table.size()); + // If we do NOT identify equivalent terms, then we simply need to transform the input hashtable + // into a vector, without doing anything else. 
+ if (!identify_equiv_terms) { + // The result will contain exactly as many elements as the input `table` has + result.reserve(table.size()); + for (const auto& elem : table) { + result.push_back(elem); + } - // Traverse through map in a sorted order on keys to maintain deterministic behavior - // We do this by comparing the string repr of each PrimExpr to get a determinstic ordering - std::vector> sorted_map_items(table.begin(), table.end()); + return result; + } - sort(sorted_map_items.begin(), sorted_map_items.end(), + // Otherwise, in order to identify equivalent terms, we will go through a table `norm_table` + // where normal forms are the keys., and use it to efficiently merge equivalent terms. + + // In order to produce the result (a vector of semantical entities), the input table will be + // normalized. This normalized table will keep the count for each set of equivalent terms + // (i.e. each equivalence class), together with a term that did appear in this equivalence class + // (in practice, the first term of the equivalence class that was encoutered). + std::unordered_map, StructuralHash, ExprDeepEqual> + norm_table; + + // In order to avoid frequent rehashing if the norm_table becomes big, we immediately ask for + // enough space to store the amount of elements that the input table has, as it's clearly an + // upper bound (in the worst case, each element is its own representant, and there is as many + // equivalence classes as there are elements) + norm_table.reserve(table.size()); + + // Transform the input hashtable to a vector and sort it according to some order, as we will be + // iterating through its items soon, and the order of appearance will be used to determine the + // individual representant for each class of equivalence, which we want to be deterministic + // (otherwise {x+y, y+x} could be both replaced by x+y, and on another run by y+x). + std::vector> sorted_items_of_table(table.begin(), table.end()); + + // We do the ordering by comparing the string repr of each expr to get a determinstic ordering + sort(sorted_items_of_table.begin(), sorted_items_of_table.end(), [](std::pair a, std::pair b) { std::stringstream a_stream; std::stringstream b_stream; @@ -762,21 +822,40 @@ std::vector> SyntacticToSemanticComputations( return a_stream.str().compare(b_stream.str()) < 0; }); - // For each element in the hashtable - for (auto elem : sorted_map_items) { - // We try to see if a semantically equivalent term is already in the resulting vector - auto it_found = std::find_if(result.begin(), result.end(), - [elem](std::pair already_seen) { - return EquivalentTerms(already_seen.first, elem.first); - }); - // And if so, we increase (by `elem.second`) its count - if (it_found != result.end()) { - it_found->second += elem.second; + for (const auto& elem : sorted_items_of_table) { + PrimExpr norm_elem = NormalizeTerm(elem.first, identify_equiv_terms); + // If the normalized term is not already a key in the normalized table + auto it_found = norm_table.find(norm_elem); + if (it_found == norm_table.end()) { + // Then we add the mapping `norm_elem` -> (`elem`.first, `elem`.second) to the norm table + // (i.e. 
`norm_elem` has been seen `elem`.second many times so far, and the chosen element + // to represent the equivalence class will be `elem`.first as it's the first element of the + // class that we see) + norm_table[norm_elem] = elem; } else { - // If we could not find a semantically equivalent term in the resulting vector, we add it - result.push_back(elem); + // Otherwise, it's not the first time we see a term in this equivalence class, so we just + // increase the count of this equivalence class as we now have `elem`.second additional items + // coming to the equivalence class. + it_found->second.second += elem.second; } } + + // norm_table.size() is the number of equivalence class that we have built, so it's exactly the + // number of items that we will return in the vector of semantical entities + result.reserve(norm_table.size()); + + // Transform the intermediate hashtable `norm_table` into a vector, forgetting the keys, + // (which are the normal forms), as they won't be used as the canonical representants (which are + // instead the first element of each class that is effectively seen) + // Careful : the pairs will never change (the canonical represantants chosen will always be the + // same), but the order in which the pairs are produced can vary as we are iterating through the + // hashtable `norm_table`. It is not an issue as the called will be sorting the result anyway. + std::unordered_map, StructuralHash, + ExprDeepEqual>::const_iterator it_norm_table; + for (it_norm_table = norm_table.begin(); it_norm_table != norm_table.end(); ++it_norm_table) { + result.push_back(it_norm_table->second); + } + return result; } @@ -822,17 +901,19 @@ void InsertElemToSortedSemanticComputations(std::vector>* sorted_vec, - const std::vector& vec_to_add) { + const std::vector& vec_to_add, + bool identify_equiv_terms) { if (sorted_vec == nullptr) { return; } for (auto elem_to_add : vec_to_add) { // See if the current element to add (or an equivalent one) is already present // in the sorted vector - auto it_found = std::find_if(sorted_vec->begin(), sorted_vec->end(), - [elem_to_add](std::pair elem) { - return EquivalentTerms(elem.first, elem_to_add); - }); + auto it_found = + std::find_if(sorted_vec->begin(), sorted_vec->end(), + [elem_to_add, identify_equiv_terms](std::pair elem) { + return EquivalentTerms(elem.first, elem_to_add, identify_equiv_terms); + }); // If we found `elem_to_add` (or an equivalent expression) already in sorted_vec if (it_found != sorted_vec->end()) { diff --git a/src/tir/transforms/common_subexpr_elim_tools.h b/src/tir/transforms/common_subexpr_elim_tools.h index a590cde69fafc..fcd29fddc0a17 100644 --- a/src/tir/transforms/common_subexpr_elim_tools.h +++ b/src/tir/transforms/common_subexpr_elim_tools.h @@ -180,9 +180,12 @@ void PrintComputationTable(const ComputationTable& table); using MaybeValue = dmlc::optional; bool EqualTerms(const PrimExpr& a, const PrimExpr& b); -bool EquivalentTerms(const PrimExpr& a, const PrimExpr& b); +// Used for deciding the (decidable) equivalence relation +PrimExpr NormalizeTerm(const PrimExpr& expr, bool do_normalization); +// The equivalence relation, which is the syntactical equality when `identify_equiv_terms` is false +bool EquivalentTerms(const PrimExpr& a, const PrimExpr& b, bool identify_equiv_terms); std::vector> SyntacticToSemanticComputations( - const ComputationTable& table); + const ComputationTable& table, bool identify_equiv_terms); bool PredicateIntroVarForComputation(const PrimExpr& computation, size_t nb_times_seen); // 
Polymorphic (functional) map on a vector, which builds a news vector with the same number of @@ -209,7 +212,8 @@ template std::vector VectorMap(const std::vector void InsertElemToSortedSemanticComputations(std::vector>* sorted_vec, const std::pair& pair); void InsertVectorToSortedSemanticComputations(std::vector>* sorted_vec, - const std::vector& vec_to_add); + const std::vector& vec_to_add, + bool identify_equiv_terms); } // namespace tir } // namespace tvm diff --git a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py index c12e27a46e3f2..a546c16a648ec 100644 --- a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py +++ b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py @@ -17,12 +17,16 @@ import hashlib import tvm -from tvm import te +from tvm import auto_scheduler, te, topi from tvm.ir.base import save_json from tvm.ir.module import IRModule +from tvm.script import tir as T - -# A test program which gives the opportunity for the CSE pass to introduce two new variables, at two different levels +# ----------------------------------------------------- +# Basic test for the expected Behavior of the CSE pass +# ----------------------------------------------------- +# A test program which gives the opportunity for the CSE pass to introduce two new variables, +# at two different levels def test_cse(): z1 = te.var("z1") z2 = te.var("z2") @@ -70,9 +74,9 @@ def test_cse(): ), ), ) - # This test program gives the opportunity to introduce two new variables, at two different levels - # and to perform replacements in the value of "a" and "b", using these new variables - # We will check all of that underneath and more, making also sure that nothing else has been changed + # This test program gives the opportunity to introduce two new variables, at two different + # levels and to perform replacements in the value of "a" and "b", using these new variables. 
+ # We will check all of that underneath and more, making also sure that nothing else has changed mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([i1, i2, z3], body)) body = tvm.tir.transform.CommonSubexprElimTIR()(mod) @@ -138,52 +142,14 @@ def test_cse(): assert isinstance(body.body, tvm.tir.BufferStore) -def test_deterministic_cse(): - import random - - """Test deterministic allocation of CSE vars - - We expect something like - - result = (x + 1) + (x + 2) + (x + 3) + (x + 1) + (x + 2) + (x + 3) - --> - cse_var_3 = (x + 1) - cse_var_2 = (x + 2) - cse_var_1 = (x + 3) - result = cse_var_3 + cse_var_2 + cse_var_1 + cse_var_3 + cse_var_2 + cse_var_1 - """ - NUM_TERMS = 10 - REPEATS = 10 - - x = te.var("x") - result = te.var("result") - - offsets = sorted([i + 1 for i in range(NUM_TERMS)]) - inc1 = [(x + offsets[i]) for i in range(NUM_TERMS)] - inc2 = [(x + offsets[i]) for i in range(NUM_TERMS)] - - expression = x - for add in inc1 + inc2: - expression = expression + add - let_stmt = tvm.tir.LetStmt(result, expression, tvm.tir.Evaluate(result)) - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([x], let_stmt)) - - initial_hash = None - for _ in range(REPEATS): - body = tvm.tir.transform.CommonSubexprElimTIR()(mod)["main"] - - # Hash and ensure serialize json is the same every time - json_val = save_json(body) - json_hash = hashlib.sha256(json_val.encode()).hexdigest() - - if initial_hash is None: - initial_hash = json_hash - assert json_hash == initial_hash - - -# First specific test for if nodes : Some duplicated computations appear only in one branch (here the Then branch), not in both branches. -# In this case, the CSE pass should introduce the redundant computation at the top if the Then branch, not before the whole If -# (otherwise that would lead to some computations being computed for nothing when it is the Else branch that is executed). +# ----------------------------------------------------- +# Tests related to If nodes +# ----------------------------------------------------- +# First specific test for if nodes : Some duplicated computations appear only in one branch (here +# the Then branch), not in both branches. +# In this case, the CSE pass should introduce the redundant computation at the top of the Then +# branch, not before the whole If (otherwise that would lead to some computations being computed +# for nothing when it is the Else branch that is executed). def test_cse_ifNode_1(): b = te.var("b") i1 = te.var("i1") @@ -237,9 +203,9 @@ def test_cse_ifNode_1(): assert tvm.ir.structural_equal(body.value, y + z) -# Second test for if nodes : Some duplicated computations appear in both the Then and the Else branch. -# In this case, the CSE pass should introduce the redundant computation before the whole If node, because -# regardless of the execution path, it is going to be computed. +# Second test for if nodes : Some duplicated computations appear in both the Then and Else branch. +# In this case, the CSE pass should introduce the redundant computation before the whole If node, +# because regardless of the execution path, it is going to be computed. 
def test_cse_ifNode_2(): b = te.var("b") i1 = te.var("i1") @@ -265,7 +231,7 @@ def test_cse_ifNode_2(): b, tvm.tir.SeqStmt( [ - tvm.tir.BufferStore(buffer, y + z, [i1]), # (y+z) is present in the Then branch + tvm.tir.BufferStore(buffer, y + z, [i1]), # (y+z) is present in Then branch tvm.tir.BufferStore(buffer, y, [i2]), ] ), @@ -288,9 +254,11 @@ def test_cse_ifNode_2(): assert tvm.ir.structural_equal(body.value, y + z) +# ------------------------------------------------------------------------------------------------- # Test commoning in cascade : after having introduced a big exp ((x+y)+z) into a new variable, # it will become possible to do another commoning for (x+y) which appears both in the new variable # and in the rest of the program. +# ------------------------------------------------------------------------------------------------- def test_cse_cascade(): i1 = te.var("i1") i2 = te.var("i2") @@ -353,8 +321,188 @@ def test_cse_cascade(): assert tvm.ir.structural_equal(store3.value, cse_var_2) +# ----------------------------------------------------------------------------------------- +# A test which ensures that we don't perform normalizations outside of introduced variables +# ----------------------------------------------------------------------------------------- +def test_no_normalization_without_commoning(): + x = te.var("x") + y = te.var("y") + z = te.var("z") + a = te.var("a") + # Test prog : + # let a = x + (y + z) in a + body = tvm.tir.LetStmt(a, x + (y + z), tvm.tir.Evaluate(a)) + + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([x, y, z], body)) + body = tvm.tir.transform.CommonSubexprElimTIR(identify_equiv_terms=True)(mod) + + tvm.transform.PrintIR()(body) + + body = body["main"].body # Gets the body of the main, i.e. the full statement + + assert body.var.name == "a" + assert tvm.ir.structural_equal(body.value, x + (y + z)) + + +# ------------------------------------------------- +# Part for testing the commoning with equivalences +# ------------------------------------------------- +@T.prim_func +def func_distributivity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32) -> None: + B = T.buffer_decl((50,), "int32") + B[i1] = x * (y + z) + B[i2] = x * y + x * z + + +@T.prim_func +def func_distributivity_expected( + i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32 +) -> None: + B = T.buffer_decl((50,), "int32") + cse_var_1 = T.var("int32") + with T.let(cse_var_1, x * y + x * z): + B[i1] = cse_var_1 + B[i2] = cse_var_1 + + +@T.prim_func +def func_associativity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32) -> None: + B = T.buffer_decl((50,), "int32") + B[i1] = (x + y) + z + B[i2] = x + (y + z) + + +@T.prim_func +def func_associativity_expected( + i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32 +) -> None: + B = T.buffer_decl((50,), "int32") + cse_var_1 = T.var("int32") + with T.let(cse_var_1, (x + y) + z): + B[i1] = cse_var_1 + B[i2] = cse_var_1 + + +def _check(original, transformed): + func = original + mod = tvm.IRModule.from_expr(func) + body = tvm.tir.transform.CommonSubexprElimTIR(identify_equiv_terms=True)(mod) + tvm.transform.PrintIR()(body) + tvm.ir.assert_structural_equal(body["main"], transformed) + + +def test_semantic_equiv_distributivity(): + _check(func_distributivity, func_distributivity_expected) + + +def test_semantic_equiv_associativity(): + _check(func_associativity, func_associativity_expected) + + +# ----------------------------------------------------- +# Tests that verify the determinism of the pass 
+# ----------------------------------------------------- +def test_deterministic_cse(): + import random + + """Test deterministic allocation of CSE vars + + We expect something like + + result = (x + 1) + (x + 2) + (x + 3) + (x + 1) + (x + 2) + (x + 3) + --> + cse_var_3 = (x + 1) + cse_var_2 = (x + 2) + cse_var_1 = (x + 3) + result = cse_var_3 + cse_var_2 + cse_var_1 + cse_var_3 + cse_var_2 + cse_var_1 + """ + NUM_TERMS = 10 + REPEATS = 10 + + x = te.var("x") + result = te.var("result") + + offsets = sorted([i + 1 for i in range(NUM_TERMS)]) + inc1 = [(x + offsets[i]) for i in range(NUM_TERMS)] + inc2 = [(x + offsets[i]) for i in range(NUM_TERMS)] + + expression = x + for add in inc1 + inc2: + expression = expression + add + let_stmt = tvm.tir.LetStmt(result, expression, tvm.tir.Evaluate(result)) + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([x], let_stmt)) + + initial_hash = None + for _ in range(REPEATS): + body = tvm.tir.transform.CommonSubexprElimTIR()(mod) + + body = body["main"] + + # Hash and ensure serialize json is the same every time + json_val = save_json(body) + json_hash = hashlib.sha256(json_val.encode()).hexdigest() + + if initial_hash is None: + initial_hash = json_hash + assert json_hash == initial_hash + + +# Needed for the second test on determinism +LOG_LINE = '{"i": [["[\\"conv2d_layer\\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", \ + "llvm -keys=cpu -link-params=0 -mcpu=broadwell -num-cores=2", \ + [8, 64, 64, 0, 0, 0, 0, 0], "", 1, []], [[], [["CI", 5], \ + ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 512, [1, 32, 16], 1], \ + ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 7, [1, 1, 1], 1], \ + ["SP", 3, 16, 512, [1], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 3, [3], 1], \ + ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, \ + 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], \ + ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], \ + ["CA", 3, 6, 7], ["CA", 1, 6, 5], ["FU", 6, [0, 1, 2, 3, 4, 5]], ["AN", 6, 0, 3], \ + ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], \ + ["AN", 6, 6, 2]]]], "r": [[0.0331129], 0, 0.900362, 1647464342], "v": "v0.6"}\n' + +# The workload associated with the log +@auto_scheduler.register_workload +def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): + data = te.placeholder((N, CI, H, W), name="data") + kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + bias = te.placeholder((1, CO, 1, 1), name="bias") + conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") + out = topi.nn.relu(conv + bias) + return [data, kernel, bias, out] + + +def test_deterministic_cse_2(): + inp, inr = auto_scheduler.measure_record.load_record_from_string(LOG_LINE) + inp = auto_scheduler.measure.recover_measure_input(inp, rebuild_state=True) + + initial_hash = None + + for _ in range(10): + sch, args = inp.task.compute_dag.apply_steps_from_state(inp.state) + ir_module = tvm.lower(sch, args) + primfunc = ir_module["main"] + json_str = save_json(primfunc) + new_hash = hashlib.sha256(json_str.encode("utf-8")).hexdigest() + # Make sure that all the hashes are going to be the same + if initial_hash is None: + initial_hash = new_hash + assert new_hash == initial_hash + + if __name__ == "__main__": + # Basic test: test_cse() + # Tests related to If nodes: test_cse_ifNode_1() test_cse_ifNode_2() + # Test performing a commoning on a commoning: test_cse_cascade() + # Test that verifies that the input program itself is not 
being normalized by the pass: + test_no_normalization_without_commoning() + # Tests that turn on the equivalence of terms and verify the commoning with equivalences: + test_semantic_equiv_distributivity() + test_semantic_equiv_associativity() + # Tests that verify the determinism of the pass: + test_deterministic_cse() + test_deterministic_cse_2() From ebc9b6d41cbb6720654dd1fd54488a88b4a8898d Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 9 Jun 2022 09:41:02 -0700 Subject: [PATCH 078/181] [ci] Add guards to pytest_wrapper (#11553) This should fix #11544 and adds some more logging in case the issue persists. Unfortunately it is difficult to test for real since the case data in that PR is thrown away after Jenkins is done (Jenkins does store test data but it marshals JUnits into its own format) Co-authored-by: driazati --- Jenkinsfile | 338 +++++++++++++++++++++++++++++++- jenkins/macros.j2 | 12 ++ tests/scripts/git_utils.py | 5 +- tests/scripts/pytest_wrapper.py | 9 +- 4 files changed, 358 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 334448a7ae24b..0205a1e7364fe 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-06-01T16:34:53.941462 +// Generated at 2022-06-02T14:03:43.284817 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -847,6 +847,14 @@ def shard_run_unittest_GPU_1_of_3() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -899,6 +907,14 @@ def shard_run_unittest_GPU_2_of_3() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -947,6 +963,14 @@ def shard_run_unittest_GPU_3_of_3() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -994,6 +1018,14 @@ def shard_run_integration_CPU_1_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1040,6 +1072,14 @@ def shard_run_integration_CPU_2_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1086,6 +1126,14 @@ def shard_run_integration_CPU_3_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1132,6 +1180,14 @@ def shard_run_integration_CPU_4_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload 
JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1178,6 +1234,14 @@ def shard_run_integration_CPU_5_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1224,6 +1288,14 @@ def shard_run_integration_CPU_6_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1271,6 +1343,14 @@ def shard_run_python_i386_1_of_5() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1317,6 +1397,14 @@ def shard_run_python_i386_2_of_5() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1362,6 +1450,14 @@ def shard_run_python_i386_3_of_5() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1407,6 +1503,14 @@ def shard_run_python_i386_4_of_5() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1452,6 +1556,14 @@ def shard_run_python_i386_5_of_5() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1498,6 +1610,14 @@ def shard_run_test_Hexagon_1_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1542,6 +1662,14 @@ def shard_run_test_Hexagon_2_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1586,6 +1714,14 @@ def shard_run_test_Hexagon_3_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1630,6 +1766,14 @@ def shard_run_test_Hexagon_4_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1674,6 +1818,14 @@ def shard_run_test_Hexagon_5_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1718,6 +1870,14 @@ def shard_run_test_Hexagon_6_of_7() { }) } } 
finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1762,6 +1922,14 @@ def shard_run_test_Hexagon_7_of_7() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1808,6 +1976,14 @@ def shard_run_integration_aarch64_1_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1853,6 +2029,14 @@ def shard_run_integration_aarch64_2_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1898,6 +2082,14 @@ def shard_run_integration_aarch64_3_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1943,6 +2135,14 @@ def shard_run_integration_aarch64_4_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -1988,6 +2188,14 @@ def shard_run_topi_GPU_1_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2032,6 +2240,14 @@ def shard_run_topi_GPU_2_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2076,6 +2292,14 @@ def shard_run_topi_GPU_3_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2120,6 +2344,14 @@ def shard_run_topi_GPU_4_of_4() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2165,6 +2397,14 @@ def shard_run_frontend_GPU_1_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2209,6 +2449,14 @@ def shard_run_frontend_GPU_2_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2253,6 +2501,14 @@ def shard_run_frontend_GPU_3_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results 
--recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2297,6 +2553,14 @@ def shard_run_frontend_GPU_4_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2341,6 +2605,14 @@ def shard_run_frontend_GPU_5_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2385,6 +2657,14 @@ def shard_run_frontend_GPU_6_of_6() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2435,6 +2715,14 @@ def shard_run_topi_aarch64_1_of_2() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2483,6 +2771,14 @@ def shard_run_topi_aarch64_2_of_2() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2528,6 +2824,14 @@ def shard_run_frontend_aarch64_1_of_2() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2572,6 +2876,14 @@ def shard_run_frontend_aarch64_2_of_2() { }) } } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2742,6 +3054,14 @@ stage('Test') { ) }) } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2787,6 +3107,14 @@ stage('Test') { ) }) } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } @@ -2827,6 +3155,14 @@ stage('Test') { ) }) } finally { + sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) + junit 'build/pytest-results/*.xml' } } diff --git a/jenkins/macros.j2 b/jenkins/macros.j2 index 5a641b73fea84..5d996ce19a559 100644 --- a/jenkins/macros.j2 +++ b/jenkins/macros.j2 @@ -19,6 +19,16 @@ "workspace/exec_${env.EXECUTOR_NUMBER}/{{ folder }}" {%- endmacro -%} +{% macro junit_to_s3() %} +sh( + script: """ + set -eux + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results --recursive + """, + label: 'Upload JUnits to S3', + ) +{% endmacro %} + {% macro sharded_test_step(name, num_shards, node, ws, docker_image, platform, test_method_names) %} {% for shard_index in range(1, num_shards + 1) %} @@ -39,6 +49,7 @@ def {{ method_name }}() { }) } } finally { + {{ 
junit_to_s3() }} junit 'build/pytest-results/*.xml' } } @@ -86,6 +97,7 @@ def {{ method_name }}() { {{ caller() | indent(width=12) | trim }} }) } finally { + {{ junit_to_s3() | indent(width=4) }} junit 'build/pytest-results/*.xml' } } diff --git a/tests/scripts/git_utils.py b/tests/scripts/git_utils.py index 0e2e85e552431..267756d859050 100644 --- a/tests/scripts/git_utils.py +++ b/tests/scripts/git_utils.py @@ -36,7 +36,7 @@ def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = req = request.Request(url, headers=headers, method="POST") if auth is not None: auth_str = base64.b64encode(f"{auth[0]}:{auth[1]}".encode()) - req.add_header("Authorization", f"Basic {auth_str}") + req.add_header("Authorization", f"Basic {auth_str.decode()}") if body is None: body = "" @@ -47,8 +47,7 @@ def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = req.add_header("Content-Length", len(data)) with request.urlopen(req, data) as response: - response = json.loads(response.read()) - return response + return response.read() class GitHubRepo: diff --git a/tests/scripts/pytest_wrapper.py b/tests/scripts/pytest_wrapper.py index a7b6f0dfa766d..4c4410bedc9c6 100755 --- a/tests/scripts/pytest_wrapper.py +++ b/tests/scripts/pytest_wrapper.py @@ -18,6 +18,7 @@ import argparse import textwrap import junitparser +import traceback from pathlib import Path from typing import List, Optional import os @@ -51,6 +52,10 @@ def failed_test_ids() -> List[str]: for suite in xml: # handle suites for case in suite: + if case.result is None: + logging.warn(f"Incorrectly formatted JUnit found, result was None on {case}") + continue + if len(case.result) > 0 and isinstance(case.result[0], FAILURE_TYPES): node_id = classname_to_file(case.classname) + "::" + case.name failed_node_ids.append(node_id) @@ -112,7 +117,7 @@ def show_failure_help(failed_suites: List[str]) -> None: "If there is no test listed below, the failure likely came from a segmentation " "fault which you can find in the logs above.\n" ) - if len(failed_suites) > 0: + if failed_suites is not None and len(failed_suites) > 0: print("\n".join([f" - {suite}" for suite in failed_suites])) print("") @@ -131,4 +136,4 @@ def show_failure_help(failed_suites: List[str]) -> None: except Exception as e: # This script shouldn't ever introduce failures since it's just there to # add extra information, so ignore any errors - logging.error(str(e)) + logging.exception(e) From 87502ddd9002cdfe1035a2bc1c7063e33098ced1 Mon Sep 17 00:00:00 2001 From: Sunghyun Park <49998730+sunggg@users.noreply.github.com> Date: Thu, 9 Jun 2022 10:14:46 -0700 Subject: [PATCH 079/181] [PASS] Refactor a couple of TIR passes - BindTarget, AnnotateEntryFunc, Filter, LowerInitBlock (#11628) This PR fixes a few inconsistent pass registration and add testcases for them. - `LowerInitBlock` had mismatch between its pass name and ffi key. - `BindTarget`, `AnnotateEntryFunc`, `Filter` were not following the name convention of tir passes and they were not registered in FFI registry. 
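Since all three passes are now registered in the FFI registry, they can be driven directly from Python. The snippet below is only a rough sketch mirroring the new unit tests in this change (the module, target and predicate are made up for illustration), not an exhaustive reference:

    import tvm
    from tvm.script import tir as T

    @tvm.script.ir_module
    class Example:  # hypothetical single-PrimFunc module, in the style of the added tests
        @T.prim_func
        def func(A: T.Buffer[(16,), "float32"]):
            for i in T.serial(16):
                A[i] = 0.0

    target = tvm.target.Target("llvm")  # any target; "llvm" is an arbitrary choice here
    # Attach the target to every PrimFunc in the module.
    mod = tvm.tir.transform.BindTarget(target)(Example)
    # Mark the PrimFunc as the entry function; the pass expects exactly one PrimFunc in the module.
    mod = tvm.tir.transform.AnnotateEntryFunc()(mod)
    # Keep only the PrimFuncs for which the predicate returns True (everything, in this sketch).
    mod = tvm.tir.transform.Filter(lambda f: True)(mod)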
--- include/tvm/tir/transform.h | 19 +++ python/tvm/tir/transform/transform.py | 61 ++++++--- src/driver/driver_api.cc | 45 ++----- src/tir/transforms/lower_init_block.cc | 2 +- src/tir/transforms/primfunc_utils.cc | 63 +++++++++ .../convert_pool_allocations_to_offsets.cc | 2 +- .../unittest/test_tir_transform_helpers.py | 123 ++++++++++++++++++ 7 files changed, 258 insertions(+), 57 deletions(-) create mode 100644 src/tir/transforms/primfunc_utils.cc create mode 100644 tests/python/unittest/test_tir_transform_helpers.py diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 4612d5ad3feac..6393eeb9430b9 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -25,6 +25,7 @@ #define TVM_TIR_TRANSFORM_H_ #include +#include #include #include @@ -625,6 +626,24 @@ TVM_DLL Pass ExtractPrimFuncConstants(); */ TVM_DLL Pass RenormalizeSplitPattern(); +/*! + * \brief Annotate a PrimFunc with a given target. + * \return The pass. + */ +TVM_DLL Pass BindTarget(Target target); + +/*! + * \brief Set a PrimFunc as the entry point if it is only function in IRModule. + * \return The pass. + */ +TVM_DLL Pass AnnotateEntryFunc(); + +/*! + * \brief Filter PrimFuncs with a given condition. + * \return The pass. + */ +TVM_DLL Pass Filter(runtime::TypedPackedFunc fcond); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 1bed29c560fc9..e0a7501ef92af 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -16,7 +16,8 @@ # under the License. """Wrapping existing transformations.""" # pylint: disable=invalid-name -from typing import Optional +from typing import Optional, Callable + from . import _ffi_api from . import function_pass as _fpass @@ -43,26 +44,6 @@ def _transform(func, mod, ctx): return _fpass.prim_func_pass(_transform, opt_level=0, name="Apply") # type: ignore -def Filter(fcond): - """Filter functions by the calling convention attribute. - - Parameters - ---------- - fcond : tvm.tir.PrimFunc -> bool - The condition of the filtering. - - Returns - ------- - fpass : tvm.transform.Pass - The result pass - """ - # pylint: disable=unused-argument - def _transform(func, mod, ctx): - return func if fcond(func) else None - - return _fpass.prim_func_pass(_transform, opt_level=0, name="Filter") # type: ignore - - def InjectPrefetch(): """Inject prefetch instructions into stmt. @@ -806,3 +787,41 @@ def RenormalizeSplitPattern(): The result pass """ return _ffi_api.RenormalizeSplitPattern() # type: ignore + + +def BindTarget(target): + """Annotate a PrimFunc with a given target. + Parameters + ------- + target : tvm.target.Target + target + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.BindTarget(target) # type: ignore + + +def AnnotateEntryFunc(): + """Set a PrimFunc as the entry point if it is only function in IRModule. + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.AnnotateEntryFunc() # type: ignore + + +def Filter(fcond: Callable): + """Filter out PrimFuncs that does not satisfy the given condition. + `fcond` should be a function that takes a primfunc and returns boolean. 
+ + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.Filter(fcond) # type: ignore diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7706f229c9ed3..ace31800de27f 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -164,32 +164,6 @@ TVM_REGISTER_GLOBAL("driver.get_binds") return out_arr; }); -transform::Pass BindTarget(Target target) { - auto fpass = [target](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { - return WithAttr(std::move(f), tvm::attr::kTarget, target); - }; - return tir::transform::CreatePrimFuncPass(fpass, 0, "BindTarget", {}); -} - -static transform::Pass AnnotateEntryFunc(bool b) { - auto fpass = [](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { - return WithAttr(std::move(f), tir::attr::kIsEntryFunc, Bool(true)); - }; - return tir::transform::CreatePrimFuncPass(fpass, 0, "AnnotateEntryFunc", {}); -} - -template -transform::Pass Filter(FCond fcond) { - auto fpass = [fcond](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { - if (fcond(f)) { - return f; - } else { - return tir::PrimFunc(nullptr); - } - }; - return tir::transform::CreatePrimFuncPass(fpass, 0, "Filter", {}); -} - Array CreatePassList(bool disable_loop_partition) { transform::PassContext pass_ctx = transform::PassContext::Current(); @@ -564,12 +538,12 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) Array mixed_pass_list; - mixed_pass_list.push_back(BindTarget(target)); + mixed_pass_list.push_back(tir::transform::BindTarget(target)); mixed_pass_list.push_back(tir::transform::VerifyMemory()); if (ShouldAnnotateEntryFunc(mixed_mod)) { - mixed_pass_list.push_back(AnnotateEntryFunc(true)); + mixed_pass_list.push_back(tir::transform::AnnotateEntryFunc()); } bool detect_global_barrier = @@ -606,14 +580,16 @@ TVM_REGISTER_GLOBAL("driver.mixed_mod_passes") transform::Sequential HostModulePassManager(IRModule mixed_mod, Target target_host) { Array host_pass_list; - host_pass_list.push_back(Filter([](const tir::PrimFunc& f) { + + runtime::TypedPackedFunc fcond = [](const tir::PrimFunc& f) { return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) != CallingConv::kDeviceKernelLaunch; - })); + }; + host_pass_list.push_back(tir::transform::Filter(fcond)); ICHECK(mixed_mod.defined()) << "This module must be defined"; - host_pass_list.push_back(BindTarget(target_host)); + host_pass_list.push_back(tir::transform::BindTarget(target_host)); host_pass_list.push_back(tir::transform::LowerTVMBuiltin()); host_pass_list.push_back(tir::transform::LowerCustomDatatypes()); @@ -631,12 +607,13 @@ TVM_REGISTER_GLOBAL("driver.host_mod_passes") transform::Sequential DeviceModulePassManager(IRModule mixed_mod, Target target) { Array device_pass_list; - device_pass_list.push_back(Filter([](const tir::PrimFunc& f) { + runtime::TypedPackedFunc fcond = [](const tir::PrimFunc& f) { return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == CallingConv::kDeviceKernelLaunch; - })); + }; + device_pass_list.push_back(tir::transform::Filter(fcond)); - device_pass_list.push_back(BindTarget(target)); + device_pass_list.push_back(tir::transform::BindTarget(target)); device_pass_list.push_back(tir::transform::LowerWarpMemory()); device_pass_list.push_back(tir::transform::Simplify()); diff --git a/src/tir/transforms/lower_init_block.cc b/src/tir/transforms/lower_init_block.cc index d8621ac3b3e6d..17b4e3fb22e62 100644 --- a/src/tir/transforms/lower_init_block.cc +++ 
b/src/tir/transforms/lower_init_block.cc @@ -81,7 +81,7 @@ Pass LowerInitBlock() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { return LowerInitBlock(std::move(f)); }; - return CreatePrimFuncPass(pass_func, 0, "tir.LowerReduction", {}); + return CreatePrimFuncPass(pass_func, 0, "tir.LowerInitBlock", {}); } TVM_REGISTER_GLOBAL("tir.transform.LowerInitBlock").set_body_typed(LowerInitBlock); diff --git a/src/tir/transforms/primfunc_utils.cc b/src/tir/transforms/primfunc_utils.cc new file mode 100644 index 0000000000000..d2bb259f9921f --- /dev/null +++ b/src/tir/transforms/primfunc_utils.cc @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file primfunc_utils.cc + * \brief Passes that serve as helper functions. + */ + +#include +#include + +namespace tvm { +namespace tir { +namespace transform { +transform::Pass BindTarget(Target target) { + auto fpass = [target](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { + return WithAttr(std::move(f), tvm::attr::kTarget, target); + }; + return tir::transform::CreatePrimFuncPass(fpass, 0, "tir.BindTarget", {}); +} + +transform::Pass AnnotateEntryFunc() { + auto fpass = [](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { + ICHECK(m->functions.size() == 1); + return WithAttr(std::move(f), tir::attr::kIsEntryFunc, Bool(true)); + }; + return tir::transform::CreatePrimFuncPass(fpass, 0, "tir.AnnotateEntryFunc", {}); +} + +transform::Pass Filter(runtime::TypedPackedFunc fcond) { + auto fpass = [fcond](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { + if (fcond(f)) { + return f; + } else { + return tir::PrimFunc(nullptr); + } + }; + return tir::transform::CreatePrimFuncPass(fpass, 0, "tir.Filter", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.BindTarget").set_body_typed(BindTarget); +TVM_REGISTER_GLOBAL("tir.transform.AnnotateEntryFunc").set_body_typed(AnnotateEntryFunc); +TVM_REGISTER_GLOBAL("tir.transform.Filter").set_body_typed(Filter); + +} // namespace transform +} // namespace tir +} // namespace tvm diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index dc71e3d60891c..1161962f12872 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -60,7 +60,7 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { PoolInfo pool_info = pool_allocation->pool_info; int byte_pool_offset = pool_allocation->byte_offset->value; int required_pool_size_for_allocation = - byte_pool_offset + CalculateExtentsSize(allocate_node.operator->()); + byte_pool_offset + static_cast(CalculateExtentsSize(allocate_node.operator->())); if 
(all_pools_sizes_.find(pool_info) == all_pools_sizes_.end()) { all_pools_sizes_[pool_info] = required_pool_size_for_allocation; } else { diff --git a/tests/python/unittest/test_tir_transform_helpers.py b/tests/python/unittest/test_tir_transform_helpers.py new file mode 100644 index 0000000000000..01496e0e0fc13 --- /dev/null +++ b/tests/python/unittest/test_tir_transform_helpers.py @@ -0,0 +1,123 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +import tvm +from tvm.script import tir as T +import tvm.testing + + +def test_annotate_entry_func_single_primfunc(): + @tvm.script.ir_module + class MockModule: + @T.prim_func + def func1(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + if i == 5: + A[i] = 0.0 + + mod = MockModule + assert mod + assert mod["func1"].attrs is None + after = tvm.tir.transform.AnnotateEntryFunc()(mod) + assert ( + after["func1"].attrs + and "tir.is_entry_func" in after["func1"].attrs + and after["func1"].attrs["tir.is_entry_func"] + ) + + +# Test module +@tvm.script.ir_module +class MockModule: + @T.prim_func + def func1(A: T.Buffer[(16,), "float32"]): + for i in T.serial(16): + if i == 5: + if i == 5: + A[i] = 0.0 + + @T.prim_func + def func2(A: T.Buffer[(32,), "float32"]): + for i in T.serial(32): + if i == 15: + if i == 15: + A[i] = 0.0 + + +@pytest.mark.xfail +def test_annotate_entry_func_multiple_primfunc(): + mod = MockModule + assert mod + assert mod["func1"].attrs is None + assert mod["func2"].attrs is None + # This should fail + after = tvm.tir.transform.AnnotateEntryFunc()(mod) + + +def test_bind_target(): + mod = MockModule + assert mod + + target = tvm.target.Target("cuda") + assert mod["func1"].attrs is None + assert mod["func2"].attrs is None + after = tvm.tir.transform.BindTarget(target)(mod) + + assert after["func1"].attrs and "target" in after["func1"].attrs + assert after["func1"].attrs["target"] == target + assert after["func2"].attrs and "target" in after["func2"].attrs + assert after["func2"].attrs["target"] == target + + +def test_filter_primfunc(): + mod = MockModule + assert mod + # Annotate each function for testing + mod["func1"] = mod["func1"].with_attr("temp", "test1") + mod["func2"] = mod["func2"].with_attr("temp", "test2") + + # Test condition that does not filter out anything + def checker_filter_out_none(func: tvm.tir.PrimFunc): + return (func.attrs is not None) and ("temp" in func.attrs) + + after = tvm.tir.transform.Filter(checker_filter_out_none)(mod) + assert len(after.functions) == 2 + # Filtered functions should satisfy the given condition. 
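The three helper passes exercised by this test file compose like any other module-level pass. A minimal usage sketch (illustrative only: the module handling, the "llvm" target string and the filtering predicate are assumptions, not taken from this patch):

import tvm

def apply_helper_passes(mod):
    # Attach a target to every PrimFunc in the module.
    mod = tvm.tir.transform.BindTarget(tvm.target.Target("llvm"))(mod)
    # Mark the PrimFunc as the entry point; the pass expects the module to
    # contain exactly one function.
    mod = tvm.tir.transform.AnnotateEntryFunc()(mod)
    # Keep only PrimFuncs that now carry a target attribute.
    keep = lambda f: f.attrs is not None and "target" in f.attrs
    mod = tvm.tir.transform.Filter(keep)(mod)
    return mod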
+ assert checker_filter_out_none(after["func1"]) + assert checker_filter_out_none(after["func2"]) + + # Test condition that selectively filters out primfuncs + def checker_filter_out_one(func: tvm.tir.PrimFunc): + return (func.attrs is not None) and ("temp" in func.attrs) and func.attrs["temp"] == "test1" + + after = tvm.tir.transform.Filter(checker_filter_out_one)(mod) + assert len(after.functions) == 1 + # Filtered functions should satisfy the given condition. + assert checker_filter_out_one(after["func1"]) + + # Test condition that filters out everything + def checker_filter_out_both(func: tvm.tir.PrimFunc): + return (func.attrs is not None) and ("invalid_attr" in func.attrs) + + after = tvm.tir.transform.Filter(checker_filter_out_both)(mod) + assert len(after.functions) == 0 + + +if __name__ == "__main__": + tvm.testing.main() From 7f1b819cdbc70fabaabe9374932e98a3c4bc4660 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Thu, 9 Jun 2022 11:17:09 -0600 Subject: [PATCH 080/181] [microTVM] Remove microTVM RVM version suffix (#11629) --- apps/microtvm/reference-vm/base-box-tool.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index a4777c3ff86f4..db89f323328e1 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -479,7 +479,7 @@ def release_command(args): if args.release_full_name: vm_name = args.release_full_name else: - vm_name = f"tlcpack/microtvm-{args.platform}-{args.platform_version}" + vm_name = f"tlcpack/microtvm-{args.platform}" if not args.skip_creating_release_version: subprocess.check_call( @@ -604,14 +604,6 @@ def parse_args(): action="store_true", help="Skip creating the version and just upload for this provider.", ) - parser_release.add_argument( - "--platform-version", - required=False, - help=( - "For Zephyr, the platform version to release, in the form 'x.y'. " - "For Arduino, the version of arduino-cli that's being used, in the form 'x.y.z'." - ), - ) parser_release.add_argument( "--release-full-name", required=False, @@ -619,15 +611,11 @@ def parse_args(): default=None, help=( "If set, it will use this as the full release name and version for the box. " - "If this set, it will ignore `--platform-version` and `--release-version`." + "If this set, it will ignore `--release-version`." 
), ) args = parser.parse_args() - - if args.action == "release" and not args.release_full_name: - parser.error("--platform-version is requireed.") - return args From f528a9a1cd5a0145e07b0bebcc43ab9020767cc9 Mon Sep 17 00:00:00 2001 From: czh978 <41666381+czh978@users.noreply.github.com> Date: Fri, 10 Jun 2022 01:33:44 +0800 Subject: [PATCH 081/181] [Frontend][TFLite] Improve support for half_pixel_centers in resize (#11521) * add resize_nearest_neighbor op test * Improve support for half_pixel_centers in resize --- python/tvm/relay/frontend/tflite.py | 7 ++++++- tests/python/frontend/tflite/test_forward.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 342c4e2ae553a..981074b6adb24 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -695,10 +695,15 @@ def _convert_resize(self, method, op): coord_trans = "align_corners" if align_corners else "asymmetric" coord_trans = "half_pixel" if half_pixel_centers else coord_trans + rounding_method = "" + if method == "nearest_neighbor": + if not align_corners and half_pixel_centers: + rounding_method = "round_prefer_ceil" + if bilinear_method and input_tensor.qnn_params: in_expr = self.dequantize(in_expr, input_tensor) out = _op.image.resize2d( - in_expr, target_size, None, "NHWC", method, coordinate_transformation_mode=coord_trans + in_expr, target_size, None, "NHWC", method, coord_trans, rounding_method ) if bilinear_method and output_tensor.qnn_params: out = self.quantize(out, output_tensor) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 8b0244d75eda8..76b0766dae284 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1693,6 +1693,20 @@ def test_all_resize(): align_corners=False, half_pixel_centers=False, ) + _test_resize( + tf.image.resize_nearest_neighbor, + images_data_float32, + size_data, + align_corners=True, + half_pixel_centers=False, + ) + _test_resize( + tf.image.resize_nearest_neighbor, + images_data_float32, + size_data, + align_corners=False, + half_pixel_centers=True, + ) ####################################################################### From 81b42e67460f11955794f7fc48465b15f16ae57b Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Thu, 9 Jun 2022 19:02:31 +0100 Subject: [PATCH 082/181] Making CMSIS-NN tests pylint compliant (#11625) --- tests/lint/pylint.sh | 2 + tests/python/contrib/test_cmsisnn/__init__.py | 17 ++ .../contrib/test_cmsisnn/test_binary_ops.py | 22 +- .../contrib/test_cmsisnn/test_conv2d.py | 25 +- .../test_cmsisnn/test_extract_constants.py | 217 ++++++++++-------- .../test_cmsisnn/test_fully_connected.py | 28 ++- .../test_cmsisnn/test_generate_constants.py | 19 +- .../test_cmsisnn/test_invalid_graphs.py | 14 +- .../contrib/test_cmsisnn/test_networks.py | 22 +- .../contrib/test_cmsisnn/test_pooling.py | 17 +- .../test_scalar_to_tensor_constant.py | 201 ++++++++-------- .../contrib/test_cmsisnn/test_softmax.py | 11 +- tests/python/contrib/test_cmsisnn/utils.py | 7 +- 13 files changed, 326 insertions(+), 276 deletions(-) create mode 100644 tests/python/contrib/test_cmsisnn/__init__.py diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh index 6c958a9231395..b442c33c0ff67 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -20,3 +20,5 @@ set -euxo pipefail python3 -m pylint 
python/tvm --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint vta/python/vta --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc + diff --git a/tests/python/contrib/test_cmsisnn/__init__.py b/tests/python/contrib/test_cmsisnn/__init__.py new file mode 100644 index 0000000000000..f9a622464a479 --- /dev/null +++ b/tests/python/contrib/test_cmsisnn/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Infrastructure and tests for CMSIS-NN""" diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index 49c76870157ea..fec18c197e045 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -18,17 +18,19 @@ """CMSIS-NN integration tests: binary ops""" import itertools -import sys import numpy as np -from enum import Enum import pytest import tvm from tvm import relay from tvm.relay.op.contrib import cmsisnn +from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.micro.testing.aot_test_utils import ( + AOT_USMP_CORSTONE300_RUNNER, +) -from utils import ( +from .utils import ( skip_if_no_reference_system, make_module, make_qnn_relu, @@ -36,11 +38,6 @@ assert_partitioned_function, assert_no_external_function, ) -from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run -from tvm.micro.testing.aot_test_utils import ( - AOT_CORSTONE300_RUNNER, - AOT_USMP_CORSTONE300_RUNNER, -) def generate_tensor_constant(): @@ -104,6 +101,7 @@ def make_model( def test_op_int8( op, relu_type, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point ): + """Tests QNN Conv2D operator for CMSIS-NN""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -147,8 +145,10 @@ def test_op_int8( ) -# At least one of the inputs is a constant, both can't be variables, both can't be scalars def parameterize_for_constant_inputs(test): + """Generates parameters in such a way so that at least one of the inputs is a constant, + both can't be variables, both can't be scalars. 
+ """ op = [relay.qnn.op.mul, relay.qnn.op.add] input_0 = [generate_variable("input_0"), generate_tensor_constant(), generate_scalar_constant()] input_1 = [generate_variable("input_1"), generate_tensor_constant(), generate_scalar_constant()] @@ -178,6 +178,7 @@ def parameterize_for_constant_inputs(test): @tvm.testing.requires_cmsisnn @parameterize_for_constant_inputs def test_constant_input_int8(op, input_0, input_1): + """Tests binary ops where one of the operands is a constant""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -231,9 +232,9 @@ def test_constant_input_int8(op, input_0, input_1): def test_both_scalar_inputs_int8( op, ): + """Tests binary ops where both operands are scalars""" input_scale = 0.256 input_zero_point = 33 - dtype = "int8" model = make_model( op, generate_scalar_constant(), @@ -257,6 +258,7 @@ def test_invalid_parameters( op, input_dtype, ): + """Tests binary ops for non int8 dtypes""" input_scale = 0.256 input_zero_point = 33 model = make_model( diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 90261e540a7d6..462eb88347194 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -26,8 +26,7 @@ from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_models, compile_and_run from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER -from utils import ( - skip_if_no_reference_system, +from .utils import ( make_module, get_range_for_dtype_str, get_same_padding, @@ -76,7 +75,7 @@ def make_model( shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3]) rng = np.random.default_rng(12321) - w = tvm.nd.array( + weight = tvm.nd.array( rng.integers( np.iinfo(kernel_dtype).min, high=np.iinfo(kernel_dtype).max, @@ -84,7 +83,7 @@ def make_model( dtype=kernel_dtype, ) ) - weight_const = relay.const(w, kernel_dtype) + weight_const = relay.const(weight, kernel_dtype) conv = relay.qnn.op.conv2d( invar, weight_const, @@ -102,8 +101,8 @@ def make_model( padding=p, out_dtype="int32", ) - b = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) - bias_const = relay.const(b, "int32") + bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) + bias_const = relay.const(bias, "int32") last_op = relay.nn.bias_add(conv, bias_const, axis=3) if enable_bias else conv requant_input_sc = [sc * input_scale for sc in kernel_scale] last_op = relay.qnn.op.requantize( @@ -115,7 +114,7 @@ def make_model( out_dtype=dtype, ) last_op = make_qnn_relu(last_op, relu_type, output_scale, output_zero_point, dtype) - params = {"w": w, "b": b} + params = {"w": weight, "b": bias} return last_op, params @@ -134,9 +133,9 @@ def test_conv2d_number_primfunc_args( kernel_scale, out_channels, ): + """Tests number of arguments in Conv2D primfunc""" interface_api = "c" use_unpacked_api = True - test_runner = AOT_USMP_CORSTONE300_RUNNER ifm_shape = (1, 64, 100, 4) kernel_size = (3, 3) @@ -204,7 +203,7 @@ def test_conv2d_number_primfunc_args( expected_num_params = 6 if enable_bias else 5 cmsisnn_tir_mod = None for target, mod in compiled_models[0].executor_factory.lowered_ir_mods.items(): - if "cmsis-nn" == target.kind.name: + if target.kind.name == "cmsis-nn": cmsisnn_tir_mod = mod cmsisnn_func = cmsisnn_tir_mod["tvmgen_default_cmsis_nn_main_0"] @@ -230,6 +229,7 @@ def test_conv2d_symmetric_padding_int8( kernel_scale, out_channels, ): + """Tests QNN 
Conv2D where the padding is symmetric on both sides of input""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -319,6 +319,7 @@ def test_conv2d_asymmetric_padding_int8( kernel_scale, out_channels, ): + """Tests QNN Conv2D where the padding is asymmetric on different sides of input""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -390,6 +391,7 @@ def test_conv2d_asymmetric_padding_int8( ) +# pylint: disable=import-outside-toplevel @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("ifm_shape", [(1, 55, 55, 3)]) @pytest.mark.parametrize("kernel_shape", [(3, 2), (1, 3)]) @@ -397,6 +399,7 @@ def test_conv2d_asymmetric_padding_int8( @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @pytest.mark.parametrize("activation", ["NONE", "RELU"]) def test_conv2d_int8_tflite(ifm_shape, kernel_shape, strides, dilation, padding, activation): + """Compares TVM output against TFLite output""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -460,6 +463,7 @@ def test_depthwise_int8( out_channels, depth_multiplier, ): + """Tests QNN Depthwise int8 op via CMSIS-NN""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -537,6 +541,7 @@ def test_depthwise_int8( def parameterize_for_invalid_model(test): + """Generates non int8 inputs""" in_dtype = ["uint8", "int8"] kernel_dtype = ["uint8", "int8"] kernel_zero_point = [-33, 10, 0] @@ -560,12 +565,12 @@ def test_invalid_parameters( kernel_dtype, kernel_zero_point, ): + """Tests Depthwise op for non int8 inputs""" ifm_shape = (1, 28, 28, 12) out_channels = 2 input_scale = 1 input_zero_point = 24 kernel_scale = [0.11, 0.0237] - in_min, in_max = get_range_for_dtype_str(in_dtype) kernel_layout = "HWIO" kernel_shape = [3, 3, ifm_shape[3], out_channels] diff --git a/tests/python/contrib/test_cmsisnn/test_extract_constants.py b/tests/python/contrib/test_cmsisnn/test_extract_constants.py index 789d400faf978..8831596d40e63 100644 --- a/tests/python/contrib/test_cmsisnn/test_extract_constants.py +++ b/tests/python/contrib/test_cmsisnn/test_extract_constants.py @@ -16,8 +16,6 @@ # under the License. 
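Helpers such as parameterize_for_constant_inputs and parameterize_for_invalid_model above follow a common pytest pattern: build the cross product of the argument lists, drop the combinations that the docstring rules out, then apply pytest.mark.parametrize to the wrapped test. A generic sketch of that pattern (the lists, names and filtering rule below are illustrative assumptions, not the exact helper bodies):

import itertools
import pytest

def parameterize_filtered(test):
    ops = ["add", "mul"]
    operands = ["variable", "tensor_constant", "scalar_constant"]
    combinations = [
        (op, in0, in1)
        for op, in0, in1 in itertools.product(ops, operands, operands)
        # at least one operand must be a constant, and both must not be scalars
        if not (in0 == in1 == "variable") and not (in0 == in1 == "scalar_constant")
    ]
    return pytest.mark.parametrize(["op", "input_0", "input_1"], combinations)(test)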
"""CMSIS-NN integration tests: extract_constants pass""" -import itertools -import math import numpy as np import pytest import tvm @@ -28,6 +26,8 @@ class CheckFunctionsForConstants(tvm.relay.ExprVisitor): + """Provides methods to test number of constants present in a function""" + def __init__(self): super().__init__() self.num_constants_ = 0 @@ -38,7 +38,7 @@ def visit_call(self, call): if isinstance(arg, relay.Constant) and arg.data.numpy().ndim > 0: self.num_constants_ += 1 - def check_num_constants(self, func): + def check_num_constants(self): assert self.num_constants_ == 0, "Functions should not have constant arguments in Calls" @@ -56,118 +56,132 @@ def set_composite_func_attr(func, name): @tvm.testing.requires_cmsisnn def test_external_function(): - y0_data = np.random.uniform(0, 1, (8, 8)).astype("float32") - x0 = relay.var("x0", shape=(8, 8)) - y0_const = relay.const(y0_data, "float32") - z0 = x0 + y0_const - ef = relay.Function([x0], z0, relay.TensorType((8, 8), "float32")) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - - x = relay.var("x", shape=(8, 8)) - c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) - mv = relay.GlobalVar("main") + """Tests the pass ExternConstants when the function is a global function""" + input1_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + input0 = relay.var("input0", shape=(8, 8)) + input1_const = relay.const(input1_data, "float32") + binary_op = input0 + input1_const + extern_func = relay.Function([input0], binary_op, relay.TensorType((8, 8), "float32")) + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + + arg = relay.var("arg", shape=(8, 8)) + call_extern_func = relay.Call(global_var, [arg]) + main_func = relay.Function([arg], call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = ExtractConstantsFromPartitionedFunction()(mod) - CheckFunctionsForConstants().check_num_constants(mod[ev]) + constant_verifier = CheckFunctionsForConstants() + constant_verifier.visit_function(mod[global_var]) + constant_verifier.check_num_constants() relay.transform.InferType()(mod) @tvm.testing.requires_cmsisnn def test_nested_function(): - y1_data = np.random.uniform(0, 1, (8, 8)).astype("float32") - x1 = relay.var("x1", shape=(8, 8)) - y1_const = relay.const(y1_data, "float32") - z1 = x1 + y1_const - w1 = z1 * relay.const(5.0, "float32") - lf = relay.Function([x1], w1, relay.TensorType((8, 8), "float32")) - lf = set_composite_func_attr(lf, "cmsis-nn") - - x0 = relay.var("x0", shape=(8, 8)) - c0 = relay.Call(lf, [x0]) - ef = relay.Function([x0], c0, relay.TensorType((8, 8), "float32")) - - x = relay.var("x", shape=(8, 8)) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) - mv = relay.GlobalVar("main") + """Tests the pass ExternConstants when a composite function + is present within global function + """ + input1_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + input0 = relay.var("input0", shape=(8, 8)) + input1_const = relay.const(input1_data, "float32") + binary_op0 = input0 + input1_const + binary_op1 = binary_op0 * relay.const(5.0, "float32") + local_func = 
relay.Function([input0], binary_op1, relay.TensorType((8, 8), "float32")) + local_func = set_composite_func_attr(local_func, "cmsis-nn") + + arg = relay.var("arg", shape=(8, 8)) + call_local_func = relay.Call(local_func, [arg]) + extern_func = relay.Function([arg], call_local_func, relay.TensorType((8, 8), "float32")) + + global_arg = relay.var("garg", shape=(8, 8)) + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [global_arg]) + main_func = relay.Function([global_arg], call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = ExtractConstantsFromPartitionedFunction()(mod) - CheckFunctionsForConstants().check_num_constants(mod[ev]) + constant_verifier = CheckFunctionsForConstants() + constant_verifier.visit_function(mod[global_var]) + constant_verifier.check_num_constants() relay.transform.InferType()(mod) @tvm.testing.requires_cmsisnn def test_multiple_functions(): - y20_data = np.random.uniform(0, 1, (8, 8)).astype("float32") - x20 = relay.var("x20", shape=(8, 8)) - y20_const = relay.const(y20_data, "float32") - z20 = x20 + y20_const - f20 = relay.Function([x20], z20, relay.TensorType((8, 8), "float32")) - f20 = set_composite_func_attr(f20, "cmsis-nn") - - y21_data = np.random.uniform(0, 1, (8, 8)).astype("float32") - x21 = relay.var("x21", shape=(8, 8)) - y21_const = relay.const(y21_data, "float32") - z21 = x21 + y21_const - f21 = relay.Function([x21], z21, relay.TensorType((8, 8), "float32")) - f21 = set_composite_func_attr(f21, "cmsis-nn") - - x10 = relay.var("x10", shape=(8, 8)) - c10 = relay.Call(f20, [x10]) - c11 = relay.Call(f21, [c10]) - ef = relay.Function([x10], c11, relay.TensorType((8, 8), "float32")) - x0 = relay.var("x0", shape=(8, 8)) - ev = relay.GlobalVar("cmsis-nn") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, [x0]) - mf = relay.Function([x0], c, relay.TensorType((8, 8), "float32")) - mv = relay.GlobalVar("main") + """Tests the pass ExternConstants when global function + contains multiple composite functions inside it + """ + f0_input1_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + f0_input0 = relay.var("f0_in0", shape=(8, 8)) + f0_input1_const = relay.const(f0_input1_data, "float32") + f0_binary_op = f0_input0 + f0_input1_const + f0_func = relay.Function([f0_input0], f0_binary_op, relay.TensorType((8, 8), "float32")) + f0_func = set_composite_func_attr(f0_func, "cmsis-nn") + + f1_input1_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + f1_input0 = relay.var("f1_in0", shape=(8, 8)) + f1_input1_const = relay.const(f1_input1_data, "float32") + f1_binary_op = f1_input0 + f1_input1_const + f1_func = relay.Function([f1_input0], f1_binary_op, relay.TensorType((8, 8), "float32")) + f1_func = set_composite_func_attr(f1_func, "cmsis-nn") + + arg0 = relay.var("arg0", shape=(8, 8)) + call_local_func0 = relay.Call(f0_func, [arg0]) + call_local_func1 = relay.Call(f1_func, [call_local_func0]) + extern_func = relay.Function([arg0], call_local_func1, relay.TensorType((8, 8), "float32")) + input0 = relay.var("input0", shape=(8, 8)) + global_var = relay.GlobalVar("cmsis-nn") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [input0]) + main_func = relay.Function([input0], 
call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = ExtractConstantsFromPartitionedFunction()(mod) - CheckFunctionsForConstants().check_num_constants(mod[ev]) + constant_verifier = CheckFunctionsForConstants() + constant_verifier.visit_function(mod[global_var]) + constant_verifier.check_num_constants() relay.transform.InferType()(mod) @tvm.testing.requires_cmsisnn def test_main_function(): - x0 = relay.var("x0", shape=(8, 8)) - y0 = relay.var("y0", shape=(8, 8)) - z0 = x0 + y0 - ef = relay.Function([x0, y0], z0, relay.TensorType((8, 8), "float32")) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - - x = relay.var("x", shape=(8, 8)) - y_data = np.random.uniform(0, 1, (8, 8)).astype("float32") - y_const = relay.const(y_data, "float32") - z = x + y_const - c = relay.Call(ev, [x, z]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) - mv = relay.GlobalVar("main") + """Tests the pass ExternConstants on main function""" + input0 = relay.var("input0", shape=(8, 8)) + input1 = relay.var("input1", shape=(8, 8)) + binary_op = input0 + input1 + extern_func = relay.Function([input0, input1], binary_op, relay.TensorType((8, 8), "float32")) + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + + arg = relay.var("arg", shape=(8, 8)) + input_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + input_const = relay.const(input_data, "float32") + binary_op = arg + input_const + call_extern_func = relay.Call(global_var, [arg, binary_op]) + main_func = relay.Function([arg], call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = ExtractConstantsFromPartitionedFunction()(mod) check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[mv].body) + check_for_constants.visit_call(mod[main_var].body) assert ( check_for_constants.num_constants_ == 1 ), "main() should have same number of arguments as before" @@ -176,6 +190,7 @@ def test_main_function(): @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("external_compiler", ["cmsis-nn", "other_compiler"]) def test_multiple_functions_non_cmsisnn_compiler(external_compiler): + """Tests the pass ExternConstants on non CMSIS-NN targets""" y20_data = np.random.uniform(0, 1, (8, 8)).astype("float32") x20 = relay.var("x20", shape=(8, 8)) y20_const = relay.const(y20_data, "float32") @@ -183,8 +198,8 @@ def test_multiple_functions_non_cmsisnn_compiler(external_compiler): f20 = relay.Function([x20], z20, relay.TensorType((8, 8), "float32")) f20 = set_composite_func_attr(f20, "cmsis-nn.qnn_op_1") x10 = relay.var("x10", shape=(8, 8)) - c10 = relay.Call(f20, [x10]) - ef0 = relay.Function([x10], c10, relay.TensorType((8, 8), "float32")) + call_local_func0 = relay.Call(f20, [x10]) + extern_func0 = relay.Function([x10], call_local_func0, relay.TensorType((8, 8), "float32")) y21_data = np.random.uniform(0, 1, (8, 8)).astype("float32") x21 = relay.var("x21", shape=(8, 8)) @@ -193,27 +208,27 @@ def test_multiple_functions_non_cmsisnn_compiler(external_compiler): f21 = relay.Function([x21], z21, relay.TensorType((8, 8), "float32")) f21 = set_composite_func_attr(f21, "cmsis-nn.qnn_op_2") 
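All of these tests share the same shape: a Relay function tagged for the external compiler, a main() that calls it, and a check on where constants end up after the pass. A condensed sketch of that structure (variable names, the with_attr calls and the single-constant module are illustrative; the real tests go through set_external_func_attr):

import numpy as np
import tvm
from tvm import relay

shape, dtype = (8, 8), "float32"
const = relay.const(np.ones(shape, dtype))
x = relay.var("x", shape=shape)
ext = relay.Function([x], x + const, relay.TensorType(shape, dtype))
ext = ext.with_attr("Compiler", "cmsis-nn")
ext = ext.with_attr("global_symbol", "external_function")

gv = relay.GlobalVar("external_function")
arg = relay.var("arg", shape=shape)
main = relay.Function([arg], relay.Call(gv, [arg]), relay.TensorType(shape, dtype))

mod = tvm.IRModule({gv: ext, relay.GlobalVar("main"): main})
# After ExtractConstantsFromPartitionedFunction()(mod), the tensor constant is
# expected to leave the external function body and be supplied by main() instead.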
x11 = relay.var("x11", shape=(8, 8)) - c11 = relay.Call(f21, [x11]) - ef1 = relay.Function([x11], c11, relay.TensorType((8, 8), "float32")) - - x0 = relay.var("x0", shape=(8, 8)) - ev0 = relay.GlobalVar("external_function_0") - ef0 = set_external_func_attr(ef0, external_compiler, ev0.name_hint) - c0 = relay.Call(ev0, [x0]) - ev1 = relay.GlobalVar("external_function_1") - ef1 = set_external_func_attr(ef1, external_compiler, ev1.name_hint) - c1 = relay.Call(ev1, [c0]) - mf = relay.Function([x0], c1, relay.TensorType((8, 8), "float32")) - mv = relay.GlobalVar("main") + call_local_func1 = relay.Call(f21, [x11]) + extern_func1 = relay.Function([x11], call_local_func1, relay.TensorType((8, 8), "float32")) + + input0 = relay.var("input0", shape=(8, 8)) + global_var0 = relay.GlobalVar("external_function_0") + extern_func0 = set_external_func_attr(extern_func0, external_compiler, global_var0.name_hint) + call_extern_func0 = relay.Call(global_var0, [input0]) + global_var1 = relay.GlobalVar("external_function_1") + extern_func1 = set_external_func_attr(extern_func1, external_compiler, global_var1.name_hint) + call_extern_func1 = relay.Call(global_var1, [call_extern_func0]) + main_func = relay.Function([input0], call_extern_func1, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev0] = ef0 - mod[ev1] = ef1 - mod[mv] = mf + mod[global_var0] = extern_func0 + mod[global_var1] = extern_func1 + mod[main_var] = main_func mod = ExtractConstantsFromPartitionedFunction()(mod) check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[mv].body) + check_for_constants.visit_call(mod[main_var].body) num_extracted_constants = 0 if external_compiler == "cmsis-nn": diff --git a/tests/python/contrib/test_cmsisnn/test_fully_connected.py b/tests/python/contrib/test_cmsisnn/test_fully_connected.py index c5d97f807b046..3a2061096dc12 100644 --- a/tests/python/contrib/test_cmsisnn/test_fully_connected.py +++ b/tests/python/contrib/test_cmsisnn/test_fully_connected.py @@ -27,11 +27,9 @@ from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, ) -from utils import ( - skip_if_no_reference_system, +from .utils import ( make_module, get_range_for_dtype_str, - get_same_padding, get_conv2d_qnn_params, make_qnn_relu, assert_partitioned_function, @@ -55,9 +53,9 @@ def make_model( relu_type="NONE", ): """Return a model and any parameters it may have""" - a = relay.var("input", shape=in_shape, dtype=dtype) + input_ = relay.var("input", shape=in_shape, dtype=dtype) rng = np.random.default_rng(12321) - w = tvm.nd.array( + weight = tvm.nd.array( rng.integers( np.iinfo(kernel_dtype).min, high=np.iinfo(kernel_dtype).max, @@ -65,9 +63,9 @@ def make_model( dtype=kernel_dtype, ) ) - weight_const = relay.const(w, kernel_dtype) - fc = relay.qnn.op.dense( - a, + weight_const = relay.const(weight, kernel_dtype) + dense = relay.qnn.op.dense( + input_, weight_const, input_zero_point=relay.const(input_zero_point, "int32"), kernel_zero_point=relay.const(kernel_zero_point, "int32"), @@ -77,9 +75,9 @@ def make_model( out_dtype="int32", ) - b = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) - bias_const = relay.const(b, "int32") - last_op = relay.nn.bias_add(fc, bias_const) if enable_bias else fc + bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) + bias_const = relay.const(bias, "int32") + last_op = relay.nn.bias_add(dense, bias_const) if enable_bias else dense requant_input_sc = input_scale 
* kernel_scale last_op = relay.qnn.op.requantize( last_op, @@ -90,7 +88,7 @@ def make_model( out_dtype=dtype, ) last_op = make_qnn_relu(last_op, relu_type, output_scale, output_zero_point, dtype) - params = {"w": w, "b": b} + params = {"w": weight, "b": bias} return last_op, params @@ -98,7 +96,6 @@ def make_model( @pytest.mark.parametrize("in_shape", [(2, 28), (1, 64)]) @pytest.mark.parametrize("out_channels", [12, 128]) @pytest.mark.parametrize("enable_bias", [False, True]) -@pytest.mark.parametrize("relu_type", ["RELU"]) @pytest.mark.parametrize( "input_zero_point, input_scale, kernel_scale", [(10, 0.0128, 0.11), (-64, 0.0256, 1.37)], @@ -110,8 +107,8 @@ def test_op_int8( input_scale, kernel_scale, out_channels, - relu_type, ): + """Test QNN fully connected layer""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -170,6 +167,7 @@ def test_op_int8( def parameterize_for_invalid_model(test): + """Generates parameters for non int8 inputs to fully connected layer""" in_dtype = ["uint8", "int8"] kernel_dtype = ["uint8", "int8"] kernel_zero_point = [-33, 10, 0] @@ -193,12 +191,12 @@ def test_invalid_parameters( kernel_dtype, kernel_zero_point, ): + """Tests fully connected layer with non int8 inputs""" in_shape = (2, 28) out_channels = 2 input_scale = 1 input_zero_point = 24 kernel_scale = [0.11, 0.0237] - in_min, in_max = get_range_for_dtype_str(in_dtype) kernel_shape = [out_channels, in_shape[1]] conv2d_kernel_shape = [1, 1, kernel_shape[0], kernel_shape[1]] diff --git a/tests/python/contrib/test_cmsisnn/test_generate_constants.py b/tests/python/contrib/test_cmsisnn/test_generate_constants.py index cded0f03566d4..e6faa1a243f5c 100644 --- a/tests/python/contrib/test_cmsisnn/test_generate_constants.py +++ b/tests/python/contrib/test_cmsisnn/test_generate_constants.py @@ -16,7 +16,6 @@ # under the License. 
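The requantization parameters in make_model above follow the usual quantization algebra: an int8 dense or conv accumulates (q - zero_point) products in int32, so the accumulator carries the scale input_scale * kernel_scale, which is then rescaled to the output quantization. A small numeric sketch (all values are illustrative):

import numpy as np

input_scale, kernel_scale = 0.0128, 0.11      # illustrative scales
output_scale, output_zero_point = 0.05, -10

acc_scale = input_scale * kernel_scale        # scale of the int32 accumulator
acc = np.int32(1234)                          # some accumulator value
real_value = acc_scale * float(acc)           # real number the accumulator encodes
q_out = int(np.round(real_value / output_scale)) + output_zero_point  # requantized output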
"""CMSIS-NN integration tests: generate_constants pass""" -import itertools import math import numpy as np import pytest @@ -25,9 +24,8 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn -from utils import ( +from .utils import ( make_module, - get_range_for_dtype_str, get_same_padding, get_conv2d_qnn_params, make_qnn_relu, @@ -43,6 +41,8 @@ def quantize_scale(scale): class CheckGeneratedConstants(tvm.relay.ExprVisitor): + """Provides methods to compare against expected quantization parameters""" + def __init__(self, enable_bias, multiplier, shift): super().__init__() self.num_constant_args_ = 0 @@ -53,7 +53,6 @@ def __init__(self, enable_bias, multiplier, shift): def visit_call(self, call): super().visit_call(call) if isinstance(call.op, tvm.ir.expr.GlobalVar): - # extern_fn_call(input, weight, multiplier, weight_scale, bias_optional, input_scale, shift) multiplier = call.args[2] shift = call.args[6] if self.enable_bias_ else call.args[5] assert isinstance( @@ -107,7 +106,7 @@ def make_model( weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels) rng = np.random.default_rng(12321) - w = tvm.nd.array( + weight = tvm.nd.array( rng.integers( np.iinfo(kernel_dtype).min, high=np.iinfo(kernel_dtype).max, @@ -115,7 +114,7 @@ def make_model( dtype=kernel_dtype, ) ) - weight_const = relay.const(w, kernel_dtype) + weight_const = relay.const(weight, kernel_dtype) conv = relay.qnn.op.conv2d( a, weight_const, @@ -133,8 +132,8 @@ def make_model( padding=p, out_dtype="int32", ) - b = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) - bias_const = relay.const(b, "int32") + bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) + bias_const = relay.const(bias, "int32") last_op = relay.nn.bias_add(conv, bias_const, axis=3) if enable_bias else conv requant_input_sc = [sc * input_scale for sc in kernel_scale] last_op = relay.qnn.op.requantize( @@ -146,7 +145,7 @@ def make_model( out_dtype=dtype, ) last_op = make_qnn_relu(last_op, relu_type, output_scale, output_zero_point, dtype) - params = {"w": w, "b": b} + params = {"w": weight, "b": bias} return last_op, params @@ -163,6 +162,7 @@ def test_op_int8( kernel_scale, out_channels, ): + """Tests for CMSIS-NN constants when the dtype is int8""" ifm_shape = (1, 28, 28, 3) padding = "VALID" strides = (1, 1) @@ -175,7 +175,6 @@ def test_op_int8( kernel_w = kernel_size[1] dtype = "int8" relu_type = "RELU" - in_min, in_max = get_range_for_dtype_str(dtype) weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) diff --git a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py index d0a8547d32acd..c66f9d0e07260 100644 --- a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py +++ b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py @@ -16,17 +16,14 @@ # under the License. 
"""CMSIS-NN integration tests: Tests invalid graphs""" -import itertools import numpy as np -import pytest import tvm -from tvm import relay from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, ) -from utils import ( +from .utils import ( skip_if_no_reference_system, get_range_for_dtype_str, ) @@ -35,13 +32,14 @@ @skip_if_no_reference_system @tvm.testing.requires_cmsisnn def test_empty_function(): - ORIGINAL_MODEL = """ + """Test partitioned function without composite function""" + original_model = """ #[version = "0.0.5"] def @main(%data : Tensor[(16, 29), int8]) -> Tensor[(16, 29), int8] { add(%data, %data) } """ - CMSISNN_MODEL = """ + cmsisnn_model = """ #[version = "0.0.5"] def @tvmgen_default_cmsis_nn_main_1(%i1: Tensor[(16, 29), int8], Inline=1, Compiler="cmsis-nn", global_symbol="tvmgen_default_cmsis_nn_main_1", Primitive=1) -> Tensor[(16, 29), int8] { add(%i1, %i1) @@ -51,8 +49,8 @@ def @main(%data : Tensor[(16, 29), int8]) -> Tensor[(16, 29), int8] { %1 } """ - orig_mod = tvm.parser.fromtext(ORIGINAL_MODEL) - cmsisnn_mod = tvm.parser.fromtext(CMSISNN_MODEL) + orig_mod = tvm.parser.fromtext(original_model) + cmsisnn_mod = tvm.parser.fromtext(cmsisnn_model) params = {} # validate the output diff --git a/tests/python/contrib/test_cmsisnn/test_networks.py b/tests/python/contrib/test_cmsisnn/test_networks.py index 3b1e2331f2ff5..6f9f3743a6226 100644 --- a/tests/python/contrib/test_cmsisnn/test_networks.py +++ b/tests/python/contrib/test_cmsisnn/test_networks.py @@ -17,8 +17,6 @@ """CMSIS-NN: testing with networks""" -import sys - import pytest import numpy as np @@ -26,20 +24,21 @@ from tvm import relay from tvm.contrib.download import download_testdata from tvm.relay.op.contrib import cmsisnn - -from utils import skip_if_no_reference_system, get_range_for_dtype_str from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data from tvm.micro.testing.aot_test_utils import ( AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER, ) +from .utils import skip_if_no_reference_system, get_range_for_dtype_str - +# pylint: disable=import-outside-toplevel def _convert_to_relay( tflite_model_buf, input_data, input_node, ): + """Converts TFLite model to Relay module and params""" + def convert_to_list(x): if not isinstance(x, list): x = [x] @@ -62,9 +61,9 @@ def convert_to_list(x): shape_dict = {} dtype_dict = {} - for i, e in enumerate(input_node): - shape_dict[e] = input_data[i].shape - dtype_dict[e] = input_data[i].dtype.name + for i, name in enumerate(input_node): + shape_dict[name] = input_data[i].shape + dtype_dict[name] = input_data[i].dtype.name mod, params = relay.frontend.from_tflite( tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict @@ -78,8 +77,13 @@ def convert_to_list(x): @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("test_runner", [AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER]) def test_cnn_small(test_runner): + """Download a small network and tests TVM via CMSIS-NN output against TFLite output""" # download the model - base_url = "https://github.com/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8" + base_url = ( + "https://github.com/ARM-software/ML-zoo/raw/" + "48a22ee22325d15d2371a6df24eb7d67e21dcc97" + "/models/keyword_spotting/cnn_small/tflite_int8" + ) file_to_download = "cnn_s_quantized.tflite" file_saved = "cnn_s_quantized_15Dec2021.tflite" model_file = 
download_testdata("{}/{}".format(base_url, file_to_download), file_saved) diff --git a/tests/python/contrib/test_cmsisnn/test_pooling.py b/tests/python/contrib/test_cmsisnn/test_pooling.py index 1fd280b7d81a1..6b719cdc9938e 100644 --- a/tests/python/contrib/test_cmsisnn/test_pooling.py +++ b/tests/python/contrib/test_cmsisnn/test_pooling.py @@ -16,7 +16,6 @@ # under the License. """CMSIS-NN integration tests: Conv2D""" -import itertools import numpy as np import pytest import tvm @@ -25,12 +24,10 @@ from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER -from utils import ( - skip_if_no_reference_system, +from .utils import ( make_module, get_range_for_dtype_str, get_same_padding, - get_conv2d_qnn_params, make_qnn_relu, assert_partitioned_function, assert_no_external_function, @@ -49,7 +46,9 @@ def make_model( relu_type="RELU", layout="NHWC", ): - """Return a model and any parameters it may have, all parameters are defaulted to known good values""" + """Return a model and any parameters it may have, + all parameters are defaulted to known good values + """ op = relay.var("input", shape=shape, dtype=dtype) pad_ = (0, 0, 0, 0) if padding == "SAME": @@ -61,12 +60,12 @@ def make_model( pad_value=zero_point, pad_mode="constant", ) - if pool_op == relay.nn.avg_pool2d: + if pool_op.__name__ == relay.nn.avg_pool2d.__name__: op = relay.cast(op, "int32") op = pool_op( op, pool_size=pool_size, strides=strides, padding=pad_, ceil_mode=True, layout=layout ) - if pool_op == relay.nn.avg_pool2d: + if pool_op.__name__ == relay.nn.avg_pool2d.__name__: op = relay.cast(op, dtype) op = make_qnn_relu(op, relu_type, scale, zero_point, dtype) return op @@ -91,6 +90,7 @@ def test_op_int8( zero_point, scale, ): + """Tests QNN pooling op for int8 inputs""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -138,6 +138,7 @@ def test_op_int8( @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("op", [relay.nn.avg_pool2d, relay.nn.max_pool2d]) def test_invalid_datatype(op): + """Checks CMSIS-NN partitioning for non int8 dtype""" model = make_model(pool_op=op, dtype="int64") orig_mod = make_module(model) @@ -148,6 +149,7 @@ def test_invalid_datatype(op): @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("op", [relay.nn.avg_pool2d, relay.nn.max_pool2d]) def test_invalid_batch_size(op): + """Checks CMSIS-NN partitioning when batch size is not 1""" model = make_model( pool_op=op, shape=(2, 28, 28, 12), @@ -161,6 +163,7 @@ def test_invalid_batch_size(op): @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("op", [relay.nn.avg_pool2d, relay.nn.max_pool2d]) def test_invalid_layout(op): + """Checks CMSIS-NN partitioning when layout is not NHWC""" model = make_model(pool_op=op, layout="NCHW") orig_mod = make_module(model) diff --git a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py index 35bdabf3171c4..557a65aeffcaf 100644 --- a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py +++ b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py @@ -16,10 +16,7 @@ # under the License. 
"""CMSIS-NN integration tests: scalar_to_tensor_constant pass""" -import sys - import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -56,6 +53,8 @@ def make_binary_op( class CheckFunctionsForConstants(tvm.relay.ExprVisitor): + """Provides method to test number of scalar constants present in a function""" + def __init__(self): super().__init__() self.num_constants_ = 0 @@ -66,7 +65,7 @@ def visit_call(self, call): if isinstance(arg, relay.Constant) and arg.data.numpy().ndim > 0: self.num_constants_ += 1 - def check_num_constants(self, func): + def check_num_constants(self): assert self.num_constants_ == 0, "Functions should not have constant arguments in Calls" @@ -84,44 +83,45 @@ def set_composite_func_attr(func, name): @tvm.testing.requires_cmsisnn def test_single_scalar_position_0(): + """Tests conversion to tensor constant when first operand is a scalar""" dtype = "int8" shape = (8, 8) - x0 = generate_variable("x0", None, dtype) - x1 = generate_variable("x1", shape, dtype) - z1 = make_binary_op( + operand0 = generate_variable("operand0", None, dtype) + operand1 = generate_variable("operand1", shape, dtype) + binary_op = make_binary_op( relay.qnn.op.add, - x0, - x1, + operand0, + operand1, input_0_scale=0.0128, input_0_zero_point=32, input_1_scale=0.256, input_1_zero_point=-64, ) - lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) - lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") - y0 = relay.expr.const(3, dtype) - y1 = relay.var("y1", shape=shape, dtype=dtype) - c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y1], c0, relay.TensorType(shape, dtype)) + arg0 = relay.expr.const(3, dtype) + arg1 = relay.var("arg1", shape=shape, dtype=dtype) + call_local_func = relay.Call(local_func, [arg0, arg1]) + extern_func = relay.Function([arg1], call_local_func, relay.TensorType(shape, dtype)) x = relay.var("x", shape=shape, dtype=dtype) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType(shape, dtype)) - mv = relay.GlobalVar("main") + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [x]) + main_func = relay.Function([x], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) + check_for_constants.visit_call(mod[global_var].body) assert ( check_for_constants.num_constants_ == 1 ), "Scalar constant wasn't converted into tensor constant" @@ -129,44 +129,45 @@ def test_single_scalar_position_0(): @tvm.testing.requires_cmsisnn def test_single_scalar_position_1(): + """Tests conversion to tensor constant when second operand is a scalar""" dtype = "int8" shape = (8, 8) - x0 = generate_variable("x0", shape, dtype) - x1 = generate_variable("x1", None, dtype) - z1 = make_binary_op( + operand0 = generate_variable("operand0", shape, dtype) + operand1 = generate_variable("operand1", None, 
dtype) + binary_op = make_binary_op( relay.qnn.op.add, - x0, - x1, + operand0, + operand1, input_0_scale=0.0128, input_0_zero_point=32, input_1_scale=0.256, input_1_zero_point=-64, ) - lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) - lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") - y0 = relay.var("y0", shape=shape, dtype=dtype) - y1 = relay.expr.const(3, dtype) - c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y0], c0, relay.TensorType(shape, dtype)) + arg0 = relay.var("arg0", shape=shape, dtype=dtype) + arg1 = relay.expr.const(3, dtype) + call_local_func = relay.Call(local_func, [arg0, arg1]) + extern_func = relay.Function([arg0], call_local_func, relay.TensorType(shape, dtype)) x = relay.var("x", shape=shape, dtype=dtype) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType(shape, dtype)) - mv = relay.GlobalVar("main") + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [x]) + main_func = relay.Function([x], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) + check_for_constants.visit_call(mod[global_var].body) assert ( check_for_constants.num_constants_ == 1 ), "Scalar constant wasn't converted into tensor constant" @@ -174,83 +175,85 @@ def test_single_scalar_position_1(): @tvm.testing.requires_cmsisnn def test_primary_operands_all_scalars(): + """Tests conversion to tensor constants all operands are scalars""" dtype = "int8" shape = None - x0 = generate_variable("x0", None, dtype) - x1 = generate_variable("x1", None, dtype) - z1 = make_binary_op( + operand0 = generate_variable("operand0", None, dtype) + operand1 = generate_variable("operand1", None, dtype) + binary_op = make_binary_op( relay.qnn.op.add, - x0, - x1, + operand0, + operand1, input_0_scale=0.0128, input_0_zero_point=32, input_1_scale=0.256, input_1_zero_point=-64, ) - lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) - lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") - y0 = relay.expr.const(7, dtype) - y1 = relay.expr.const(3, dtype) - c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType(shape, dtype)) + arg0 = relay.expr.const(7, dtype) + arg1 = relay.expr.const(3, dtype) + call_local_func = relay.Call(local_func, [arg0, arg1]) + extern_func = relay.Function([], call_local_func, relay.TensorType(shape, dtype)) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType(shape, dtype)) - mv = relay.GlobalVar("main") + global_var = relay.GlobalVar("external_function") + extern_func = 
set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, []) + main_func = relay.Function([], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) new_mod = relay.transform.InferType()(mod) - assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) + assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) @tvm.testing.requires_cmsisnn def test_all_primary_operands_tensor_constants(): + """Tests conversion to tensor constants all operands are tensors""" dtype = "int8" shape = (1, 3, 3, 32) - x0 = generate_variable("x0", shape, dtype) - x1 = generate_variable("x1", shape, dtype) - z1 = make_binary_op( + operand0 = generate_variable("operand0", shape, dtype) + operand1 = generate_variable("operand1", shape, dtype) + binary_op = make_binary_op( relay.qnn.op.add, - x0, - x1, + operand0, + operand1, input_0_scale=0.0128, input_0_zero_point=32, input_1_scale=0.256, input_1_zero_point=-64, ) - lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) - lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") rng = np.random.default_rng(12345) - y0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) - y1 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) - c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType(shape, dtype)) + arg0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + arg1 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + call_local_func = relay.Call(local_func, [arg0, arg1]) + extern_func = relay.Function([], call_local_func, relay.TensorType(shape, dtype)) - ev = relay.GlobalVar("external_function") - ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) - c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType(shape, dtype)) - mv = relay.GlobalVar("main") + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, []) + main_func = relay.Function([], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) new_mod = relay.transform.InferType()(mod) - assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) + assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) @tvm.testing.requires_cmsisnn @@ -258,26 +261,28 @@ def test_non_cmsisnn_ext_func(): """Non CMSISNN functions should not be altered.""" def get_mod(): - x1 = relay.var("x1", shape=None) - x2 = relay.var("x2", shape=None) - z1 = x1 + x2 - lf = relay.Function([x1, x2], z1, relay.TensorType((), "float32")) - lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - - y0 = relay.expr.const(5, "float32") - y1 = relay.expr.const(3, "float32") - c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType((), "float32")) - - ev = relay.GlobalVar("external_function") - ef = 
set_external_func_attr(ef, "foo", ev.name_hint) - c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType((), "float32")) - mv = relay.GlobalVar("main") + operand1 = relay.var("operand1", shape=None) + operand2 = relay.var("operand2", shape=None) + binary_op = operand1 + operand2 + local_func = relay.Function( + [operand1, operand2], binary_op, relay.TensorType((), "float32") + ) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") + + arg0 = relay.expr.const(5, "float32") + arg1 = relay.expr.const(3, "float32") + call_local_func = relay.Call(local_func, [arg0, arg1]) + extern_func = relay.Function([], call_local_func, relay.TensorType((), "float32")) + + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "foo", global_var.name_hint) + call_extern_func = relay.Call(global_var, []) + main_func = relay.Function([], call_extern_func, relay.TensorType((), "float32")) + main_var = relay.GlobalVar("main") mod = tvm.IRModule() - mod[ev] = ef - mod[mv] = mf + mod[global_var] = extern_func + mod[main_var] = main_func mod = relay.transform.InferType()(mod) return mod diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py index 840d0e6f4436d..c6d2e4ec45371 100644 --- a/tests/python/contrib/test_cmsisnn/test_softmax.py +++ b/tests/python/contrib/test_cmsisnn/test_softmax.py @@ -16,8 +16,6 @@ # under the License. """CMSIS-NN integration tests: Softmax""" - -import sys import itertools import numpy as np @@ -26,16 +24,16 @@ import tvm.testing from tvm import relay from tvm.relay.op.contrib import cmsisnn +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER -from utils import ( +from .utils import ( skip_if_no_reference_system, make_module, get_range_for_dtype_str, assert_partitioned_function, assert_no_external_function, ) -from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data -from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER def make_model( @@ -62,6 +60,7 @@ def make_model( @pytest.mark.parametrize(["zero_point", "scale"], [[33, 0.256], [-64, 0.0128]]) @tvm.testing.requires_cmsisnn def test_op_int8(zero_point, scale): + """Tests int8 QNN Softmax for CMSIS-NN""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -92,6 +91,7 @@ def test_op_int8(zero_point, scale): def parameterize_for_invalid_model(test): + """Generates parameters for non int8 input and output of Softmax""" in_dtype = ["uint8", "int8"] out_dtype = ["uint8", "int8"] zero_point = [-128, 64] @@ -119,6 +119,7 @@ def parameterize_for_invalid_model(test): @parameterize_for_invalid_model @tvm.testing.requires_cmsisnn def test_invalid_parameters(in_dtype, out_dtype, zero_point, scale, out_zero_point, out_scale): + """Tests for non int8 input and output of Softmax""" model = make_model( [1, 16, 16, 3], in_dtype, out_dtype, zero_point, scale, out_zero_point, out_scale ) diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py index 83c67cd95b1c7..e69329ebc5a42 100644 --- a/tests/python/contrib/test_cmsisnn/utils.py +++ b/tests/python/contrib/test_cmsisnn/utils.py @@ -17,11 +17,9 @@ """CMSIS-NN functions for testing networks""" -import platform import math +from typing import List, Union, Tuple import numpy as np -import pytest -from typing import List, Dict, Optional, Any, Union, 
Tuple import tvm from tvm import relay @@ -52,6 +50,7 @@ def visit_call(self, call): def assert_partitioned_function(orig_mod, cmsisnn_mod): + """If kCompiler attribute is missing, this function raises assertion""" attrs = [ cmsisnn_mod[var.name_hint].attrs for var in cmsisnn_mod.get_global_vars() @@ -225,3 +224,5 @@ def make_qnn_relu(expr, fused_activation_fn, scale, zero_point, dtype): ) if fused_activation_fn == "RELU": return tvm.relay.op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax) + + raise ValueError("Invalid argument provided with fused_activation_fn") From af0128158c45683d03d3cd0a8aea5afd620794c7 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 9 Jun 2022 15:34:32 -0500 Subject: [PATCH 083/181] [TIR][Schedule] Allow named block and buffer arguments in Schedule (#11624) * [Schedule] Allowed string argument as block arg This has previously been implemented for `Schedule.transform_layout` in https://github.com/apache/tvm/pull/11296, extending to allow for block arguments in all `Schedule` methods. This change was only made for arguments that must be a `BlockRV`. For arguments that may be either a `BlockRV` or another type (e.g. `Schedule.get_child_blocks` accepts either `BlockRV` or `LoopRV`), this sugar is not implemented, to avoid ambiguity. * [Schedule] Allowed string argument to Schedule.reindex Similar to https://github.com/apache/tvm/pull/11269, which added this functionality to `Schedule.transform_layout`. * CI test update --- python/tvm/tir/schedule/schedule.py | 112 ++++++++++++------ .../schedule/primitive/cache_read_write.cc | 9 +- .../test_tir_schedule_cache_read_write.py | 94 ++++++++------- .../unittest/test_tir_schedule_compute_at.py | 78 ++++++------ .../test_tir_schedule_compute_inline.py | 106 +++++++++-------- .../unittest/test_tir_schedule_reduction.py | 10 +- .../unittest/test_tir_schedule_reindex.py | 32 +++-- .../unittest/test_tir_schedule_sampling.py | 10 +- .../unittest/test_tir_schedule_set_scope.py | 9 +- .../test_tir_schedule_storage_align.py | 6 +- .../test_tir_schedule_transform_layout.py | 32 +++-- .../unittest/test_tir_schedule_utilities.py | 20 ++-- 12 files changed, 291 insertions(+), 227 deletions(-) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index d225280b655f7..d29495c430076 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -373,14 +373,14 @@ def sample_perfect_tile( @type_checked def sample_compute_location( self, - block: BlockRV, + block: Union[BlockRV, str], decision: Optional[int] = None, ) -> LoopRV: """Sample a compute-at location of the given block Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block whose compute-at location is to be sampled decision : Optional[int] The sampling decision @@ -390,6 +390,8 @@ def sample_compute_location( result : LoopRV The sampled loop where the input block is to be computed at """ + block = self._normalize_block_arg(block) + return _ffi_api.ScheduleSampleComputeLocation( # type: ignore # pylint: disable=no-member self, block, @@ -425,12 +427,12 @@ def get_block( ) @type_checked - def get_loops(self, block: BlockRV) -> List[LoopRV]: + def get_loops(self, block: Union[BlockRV, str]) -> List[LoopRV]: """Get the parent loops of the block in its scope, from outer to inner Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The query block Returns @@ -438,6 +440,7 @@ def get_loops(self, block: BlockRV) -> List[LoopRV]: loops : List[LoopRV] A list of loops above 
the given block in its scope, from outer to inner """ + block = self._normalize_block_arg(block) return list(_ffi_api.ScheduleGetLoops(self, block)) # type: ignore # pylint: disable=no-member @type_checked @@ -457,12 +460,12 @@ def get_child_blocks(self, block_or_loop: Union[BlockRV, LoopRV]) -> List[BlockR return list(_ffi_api.ScheduleGetChildBlocks(self, block_or_loop)) # type: ignore # pylint: disable=no-member @type_checked - def get_producers(self, block: BlockRV) -> List[BlockRV]: + def get_producers(self, block: Union[BlockRV, str]) -> List[BlockRV]: """Get the producers of a specific block Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block in the query Returns @@ -470,15 +473,16 @@ def get_producers(self, block: BlockRV) -> List[BlockRV]: producers : List[BlockRV] A list of producers of the given block """ + block = self._normalize_block_arg(block) return list(_ffi_api.ScheduleGetProducers(self, block)) # type: ignore # pylint: disable=no-member @type_checked - def get_consumers(self, block: BlockRV) -> List[BlockRV]: + def get_consumers(self, block: Union[BlockRV, str]) -> List[BlockRV]: """Get the consumers of a specific block Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block in the query Returns @@ -486,6 +490,7 @@ def get_consumers(self, block: BlockRV) -> List[BlockRV]: consumers : List[BlockRV] A list of consumers of the given block """ + block = self._normalize_block_arg(block) return list(_ffi_api.ScheduleGetConsumers(self, block)) # type: ignore # pylint: disable=no-member ########## Schedule: Transform loops ########## @@ -970,7 +975,9 @@ def after_unroll(a: T.handle, b: T.handle) -> None: ########## Schedule: Insert cache stages ########## @type_checked - def cache_read(self, block: BlockRV, read_buffer_index: int, storage_scope: str) -> BlockRV: + def cache_read( + self, block: Union[BlockRV, str], read_buffer_index: int, storage_scope: str + ) -> BlockRV: """Create a block that reads a buffer region into a read cache. It requires: 1) There is at most one block who write the buffer in the scope. @@ -979,7 +986,7 @@ def cache_read(self, block: BlockRV, read_buffer_index: int, storage_scope: str) Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The consumer block of the target buffer. read_buffer_index: int @@ -1036,12 +1043,15 @@ def after_cache_read(a: T.handle, b: T.handle) -> None: B[vi, vj] = A_local[vi, vj] * 2.0 """ + block = self._normalize_block_arg(block) return _ffi_api.ScheduleCacheRead( # type: ignore # pylint: disable=no-member self, block, read_buffer_index, storage_scope ) @type_checked - def cache_write(self, block: BlockRV, write_buffer_index: int, storage_scope: str) -> BlockRV: + def cache_write( + self, block: Union[BlockRV, str], write_buffer_index: int, storage_scope: str + ) -> BlockRV: """Create a block that reads a buffer region into a write cache. It requires: 1) There is only one block who write the buffer in the scope. @@ -1050,7 +1060,7 @@ def cache_write(self, block: BlockRV, write_buffer_index: int, storage_scope: st Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The producer block of the target buffer. 
write_buffer_index: int @@ -1108,12 +1118,17 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: B[vi, vj] = B_local[vi, vj] """ + block = self._normalize_block_arg(block) return _ffi_api.ScheduleCacheWrite( # type: ignore # pylint: disable=no-member self, block, write_buffer_index, storage_scope ) @type_checked - def reindex(self, block: BlockRV, buffer_index: int, buffer_index_type: str) -> BlockRV: + def reindex( + self, + block: Union[BlockRV, str], + buffer: Union[Tuple[str, int], str, Buffer], + ) -> BlockRV: """Create a block that read/write a buffer region into a read/write cache with reindexing. The layout of the cache will be the same as by the iterators of the block that reads/writes the buffer. It requires: @@ -1122,12 +1137,27 @@ def reindex(self, block: BlockRV, buffer_index: int, buffer_index_type: str) -> Parameters ---------- - block: BlockRV - The block that accesses the target buffer - buffer_index: int - The index of the buffer in block's read or write region - buffer_index_type : str - Type of the buffer index, "read" or "write" + block : Union[BlockRV, str] + + The block that accesses the target buffer. If a string, + this must uniquely identify a block. + + buffer: Union[Tuple[str,int], Buffer, str] + + The buffer to be transformed, or a specification of how to + identify the buffer to be transformed. + + If `buffer` is a tuple of ``(str,int)``, the first item + should be either "read" or "write", and the second item is + an index into the block's read or write regions. + + If `buffer` is a string, it is the name of the buffer, + which must exist within the reads/writes of the block. In + addition, the reads/writes of the block may not contain + more than one buffer with this name. + + If `buffer` is a Buffer object, it must exist within the + reads/writes of the block. 
Returns ------- @@ -1157,7 +1187,7 @@ def before_reindex( sch = tir.Schedule(before_reindex) block = sch.get_block("B") - sch.reindex(block, 0, "read) + sch.reindex(block, ("read", 0)) After applying reindex, the IR becomes: @@ -1179,6 +1209,8 @@ def after_reindex( B[vi, vj] = A_reindex[vi, vj] * 2.0 """ + block = self._normalize_block_arg(block) + buffer_index_type, buffer_index, _ = self._normalize_buffer_arg(block, buffer) assert buffer_index_type in ["read", "write"], "Invalid buffer_index_type" buffer_index_type_enum = 0 if buffer_index_type == "read" else 1 return _ffi_api.ScheduleReIndex( # type: ignore # pylint: disable=no-member @@ -1190,7 +1222,7 @@ def after_reindex( @type_checked def compute_at( self, - block: BlockRV, + block: Union[BlockRV, str], loop: LoopRV, preserve_unit_loops: bool = False, ) -> None: @@ -1213,7 +1245,7 @@ def compute_at( Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block to be moved loop: LoopRV @@ -1273,6 +1305,7 @@ def after_compute_at(a: T.handle, c: T.handle) -> None: C[vi, vj] = B[vi, vj] + 1.0 """ + block = self._normalize_block_arg(block) _ffi_api.ScheduleComputeAt( # type: ignore # pylint: disable=no-member self, block, @@ -1283,7 +1316,7 @@ def after_compute_at(a: T.handle, c: T.handle) -> None: @type_checked def reverse_compute_at( self, - block: BlockRV, + block: Union[BlockRV, str], loop: LoopRV, preserve_unit_loops: bool = False, ) -> None: @@ -1303,7 +1336,7 @@ def reverse_compute_at( Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block to be moved loop: LoopRV @@ -1363,6 +1396,7 @@ def after_reverse_compute_at(a: T.handle, c: T.handle) -> None: C[vi, vj] = B[vi, vj] + 1.0 """ + block = self._normalize_block_arg(block) _ffi_api.ScheduleReverseComputeAt( # type: ignore # pylint: disable=no-member self, block, @@ -1371,7 +1405,7 @@ def after_reverse_compute_at(a: T.handle, c: T.handle) -> None: ) @type_checked - def compute_inline(self, block: BlockRV) -> None: + def compute_inline(self, block: Union[BlockRV, str]) -> None: """Inline a block into its consumer(s). It requires: 1) The block is a complete non-root block, which only produces one buffer @@ -1386,7 +1420,7 @@ def compute_inline(self, block: BlockRV) -> None: Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block to be inlined to its consumer(s) Examples @@ -1432,10 +1466,11 @@ def after_inline(a: T.handle, c: T.handle) -> None: C[vi, vj] = A[vi, vj] * 2.0 + 1.0 """ + block = self._normalize_block_arg(block) _ffi_api.ScheduleComputeInline(self, block) # type: ignore # pylint: disable=no-member @type_checked - def reverse_compute_inline(self, block: BlockRV) -> None: + def reverse_compute_inline(self, block: Union[BlockRV, str]) -> None: """Inline a block into its only producer. 
It requires: 1) The block is a complete non-root block, which only produces and consumes one buffer @@ -1453,7 +1488,7 @@ def reverse_compute_inline(self, block: BlockRV) -> None: Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block to be inlined to its producer Examples @@ -1499,12 +1534,13 @@ def after_inline(a: T.handle, c: T.handle) -> None: C[vi, vj] = A[vi, vj] * 2.0 + 1.0 """ + block = self._normalize_block_arg(block) _ffi_api.ScheduleReverseComputeInline(self, block) # type: ignore # pylint: disable=no-member ########## Schedule: Reduction ########## @type_checked - def decompose_reduction(self, block: BlockRV, loop: LoopRV) -> BlockRV: + def decompose_reduction(self, block: Union[BlockRV, str], loop: LoopRV) -> BlockRV: """Decompose a reduction block into two separate blocks. a) The init block, which is translated from the init statement of the reduction block; @@ -1523,7 +1559,7 @@ def decompose_reduction(self, block: BlockRV, loop: LoopRV) -> BlockRV: Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The reduction block to be decomposed loop : LoopRV The loop above which the init block is inserted before. @@ -1578,6 +1614,7 @@ def after_decompose(a: ty.handle, c: ty.handle) -> None: C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] """ + block = self._normalize_block_arg(block) return _ffi_api.ScheduleDecomposeReduction(self, block, loop) # type: ignore # pylint: disable=no-member @type_checked @@ -1734,7 +1771,7 @@ def after_rfactor(a: T.handle, b: T.handle) -> None: @type_checked def storage_align( # pylint: disable=too-many-arguments self, - block: BlockRV, + block: Union[BlockRV, str], buffer_index: int, axis: int, factor: int, @@ -1747,7 +1784,7 @@ def storage_align( # pylint: disable=too-many-arguments Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The producer block of the buffer. buffer_index : int The index of the buffer in block's write region. @@ -1812,18 +1849,19 @@ def after_storage_align(a: T.handle, c: T.handle) -> None: ---- Storage_align requires the buffer to be an intermediate buffer defined via `alloc_buffer`. """ + block = self._normalize_block_arg(block) _ffi_api.ScheduleStorageAlign( # type: ignore # pylint: disable=no-member self, block, buffer_index, axis, factor, offset ) @type_checked - def set_scope(self, block: BlockRV, buffer_index: int, storage_scope: str) -> None: + def set_scope(self, block: Union[BlockRV, str], buffer_index: int, storage_scope: str) -> None: """Set the storage scope of a buffer, where the buffer is specified by the a block and a write-index Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The producer block of the buffer buffer_index : int The index of the buffer in block's write region @@ -1883,6 +1921,7 @@ def after_set_scope( ---- Set_scope requires the buffer to be an intermediate buffer defined via `alloc_buffer`. 
""" + block = self._normalize_block_arg(block) _ffi_api.ScheduleSetScope( # type: ignore # pylint: disable=no-member self, block, buffer_index, storage_scope ) @@ -2418,14 +2457,14 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) -> @type_checked def transform_block_layout( self, - block: BlockRV, + block: Union[BlockRV, str], index_map: Union[IndexMap, Callable], ) -> None: """Apply a transformation represented by IndexMap to block Parameters ---------- - block : BlockRV + block : Union[BlockRV, str] The block to be transformed index_map : Union[IndexMap, Callable] @@ -2470,6 +2509,7 @@ def after_transform_block_layout( vi, = T.axis.remap("S", [i]) B[vi // 16, vi % 16] = A[vi // 16, vi % 16] * 2.0 """ + block = self._normalize_block_arg(block) if callable(index_map): index_map = IndexMap.from_func(index_map) _ffi_api.ScheduleTransformBlockLayout( # type: ignore # pylint: disable=no-member diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index c96f88e1f6333..5a8d452f14b85 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -1241,11 +1241,10 @@ struct ReIndexTraits : public UnpackedInstTraits { Integer buffer_index_type) { PythonAPICall py("reindex"); py.Input("block", block); - py.Input("buffer_index", buffer_index); - py.Input("buffer_index_type", '"' + - std::string(BufferIndexType2Str( - static_cast(buffer_index_type->value))) + - '"'); + std::ostringstream os; + os << "(\"" << BufferIndexType2Str(static_cast(buffer_index_type->value)) + << "\", " << buffer_index << ")"; + py.Input("buffer", os.str()); py.SingleOutput(outputs); return py.Str(); } diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py index ef306b2c49290..5cd39c7ddaeb6 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -741,13 +741,15 @@ def block_predicate_cache_write_output_buf() -> None: ########## Testcases for cache_read ########## +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_cache_read_elementwise(): + +def test_cache_read_elementwise(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") block_b = sch.get_block("B") block_c = sch.get_block("C") - cached_a = sch.cache_read(block_b, 0, "global") - cached_b = sch.cache_read(block_c, 0, "local") + cached_a = sch.cache_read("B" if use_block_name else block_b, 0, "global") + cached_b = sch.cache_read("C" if use_block_name else block_c, 0, "local") assert sch.get(cached_a) == sch.get(sch.get_block("A_global")) assert sch.get(cached_b) == sch.get(sch.get_block("B_local")) assert sch.get(block_b) == sch.get(sch.get_block("B")) @@ -756,74 +758,74 @@ def test_cache_read_elementwise(): verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_cache_read_under_scope(): +def test_cache_read_under_scope(use_block_name): sch = tir.Schedule(access_under_scope, debug_mask="all") - block_b = sch.get_block("B") - block_c = sch.get_block("C") + block_b = "B" if use_block_name else sch.get_block("B") + block_c = "C" if use_block_name else sch.get_block("C") sch.cache_read(block_b, 0, "local") sch.cache_read(block_c, 0, "global") tvm.ir.assert_structural_equal(cache_read_under_scope, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=access_under_scope) -def test_cache_read_opaque_access(): +def 
test_cache_read_opaque_access(use_block_name): sch = tir.Schedule(opaque_access, debug_mask="all") - block = sch.get_block("load_store") + block = "load_store" if use_block_name else sch.get_block("load_store") sch.cache_read(block, 0, "global") tvm.ir.assert_structural_equal(cache_read_opaque_access, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=opaque_access) -def test_cache_read_location(): +def test_cache_read_location(use_block_name): sch = tir.Schedule(func_multi_consumer, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") sch.cache_read(block_b, 0, "global") tvm.ir.assert_structural_equal(cache_read_multi_consumer, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=func_multi_consumer) -def test_continuous_cache_read(): +def test_continuous_cache_read(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.cache_read(block_c, 0, "shared") sch.cache_read(block_c, 0, "local") tvm.ir.assert_structural_equal(continuous_cache_read, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_cache_read_with_block_predicate(): +def test_cache_read_with_block_predicate(use_block_name): sch = tir.Schedule(func_with_block_predicate, debug_mask="all") - block = sch.get_block("consumer") + block = "consumer" if use_block_name else sch.get_block("consumer") sch.cache_read(block, 0, "shared") tvm.ir.assert_structural_equal(block_predicate_cache_read, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=func_with_block_predicate) -def test_cache_read_non_int32_shape(): +def test_cache_read_non_int32_shape(use_block_name): sch = tir.Schedule(elementwise_shape_int64, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") sch.cache_read(block_b, 0, "global") tvm.ir.assert_structural_equal(cache_read_shape_int64, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_shape_int64) -def test_cache_read_fail_multi_producer(): +def test_cache_read_fail_multi_producer(use_block_name): sch = tir.Schedule(func_multi_producer, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.cache_read(block_b, 0, "global") -def test_cache_read_fail_index_out_of_bound(): +def test_cache_read_fail_index_out_of_bound(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.cache_read(block_b, 1, "global") -def test_cache_read_fail_invalid_storage_scope(): +def test_cache_read_fail_invalid_storage_scope(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.cache_read(block_b, 0, "test_scope") @@ -831,12 +833,12 @@ def test_cache_read_fail_invalid_storage_scope(): ########## Testcases for cache_write ########## -def test_cache_write_elementwise(): +def test_cache_write_elementwise(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") block_b = sch.get_block("B") block_c = sch.get_block("C") - cached_b = sch.cache_write(block_b, 0, "local") - cached_c = sch.cache_write(block_c, 0, "global") + cached_b = sch.cache_write("B" if use_block_name else block_b, 0, "local") + cached_c = 
sch.cache_write("C" if use_block_name else block_c, 0, "global") assert sch.get(cached_b) == sch.get(sch.get_block("B_local")) assert sch.get(cached_c) == sch.get(sch.get_block("C_global")) assert sch.get(block_b) == sch.get(sch.get_block("B")) @@ -845,10 +847,10 @@ def test_cache_write_elementwise(): verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_cache_write_under_scope(): +def test_cache_write_under_scope(use_block_name): sch = tir.Schedule(access_under_scope, debug_mask="all") - block_a = sch.get_block("A") - block_b = sch.get_block("B") + block_a = "A" if use_block_name else sch.get_block("A") + block_b = "B" if use_block_name else sch.get_block("B") block_scope = sch.get_block("scope") sch.cache_write(block_a, 0, "local") sch.cache_write(block_b, 0, "global") @@ -857,11 +859,11 @@ def test_cache_write_under_scope(): verify_trace_roundtrip(sch=sch, mod=access_under_scope) -def test_cache_write_opaque_access(): +def test_cache_write_opaque_access(use_block_name): sch = tir.Schedule(opaque_access, debug_mask="all") - block_store = sch.get_block("load_store") - block_opaque = sch.get_block("opaque") - block_match_buffer = sch.get_block("match_buffer") + block_store = "load_store" if use_block_name else sch.get_block("load_store") + block_opaque = "opaque" if use_block_name else sch.get_block("opaque") + block_match_buffer = "match_buffer" if use_block_name else sch.get_block("match_buffer") sch.cache_write(block_store, 0, "global") sch.cache_write(block_opaque, 0, "global") sch.cache_write(block_match_buffer, 0, "global") @@ -869,58 +871,58 @@ def test_cache_write_opaque_access(): verify_trace_roundtrip(sch=sch, mod=opaque_access) -def test_cache_write_location(): +def test_cache_write_location(use_block_name): sch = tir.Schedule(func_multi_consumer, debug_mask="all") - block_a = sch.get_block("A") + block_a = "A" if use_block_name else sch.get_block("A") sch.cache_write(block_a, 0, "global") tvm.ir.assert_structural_equal(cache_write_multi_consumer, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=func_multi_consumer) -def test_continuous_cache_write(): +def test_continuous_cache_write(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") sch.cache_write(block_b, 0, "shared") sch.cache_write(block_b, 0, "local") tvm.ir.assert_structural_equal(continuous_cache_write, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_cache_write_with_block_predicate(): +def test_cache_write_with_block_predicate(use_block_name): # cache write for intermediate buffer sch = tir.Schedule(func_with_block_predicate, debug_mask="all") - block = sch.get_block("producer") + block = "producer" if use_block_name else sch.get_block("producer") sch.cache_write(block, 0, "shared") tvm.ir.assert_structural_equal(block_predicate_cache_write_intermediate_buf, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=func_with_block_predicate) # cache write for external buffer sch = tir.Schedule(func_with_block_predicate, debug_mask="all") - block = sch.get_block("consumer") + block = "consumer" if use_block_name else sch.get_block("consumer") sch.cache_write(block, 0, "shared") tvm.ir.assert_structural_equal(block_predicate_cache_write_output_buf, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=func_with_block_predicate) -def test_cache_write_fail_multi_producer(): +def test_cache_write_fail_multi_producer(use_block_name): sch = tir.Schedule(func_multi_producer, debug_mask="all") - 
block_a0 = sch.get_block("A0") - block_a1 = sch.get_block("A1") + block_a0 = "A0" if use_block_name else sch.get_block("A0") + block_a1 = "A1" if use_block_name else sch.get_block("A1") with pytest.raises(tvm.tir.ScheduleError): sch.cache_write(block_a0, 0, "global") with pytest.raises(tvm.tir.ScheduleError): sch.cache_write(block_a1, 0, "global") -def test_cache_write_fail_index_out_of_bound(): +def test_cache_write_fail_index_out_of_bound(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.cache_write(block_b, 1, "global") -def test_cache_write_fail_invalid_storage_scope(): +def test_cache_write_fail_invalid_storage_scope(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.cache_write(block_b, 0, "test_scope") diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py index 3772d9a4e0fec..0c20a4783ca02 100644 --- a/tests/python/unittest/test_tir_schedule_compute_at.py +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -1052,17 +1052,19 @@ def static_bound_after_compute_at(A: T.Buffer[(32, 1), "float32"], C: T.Buffer[( # pylint: enable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks # fmt: on +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_compute_at_two_elementwise(): + +def test_compute_at_two_elementwise(use_block_name): sch = tir.Schedule(two_elementwise, debug_mask="all") - block = sch.get_block("B") - loop, _ = sch.get_loops(sch.get_block("C")) + block = "B" if use_block_name else sch.get_block("B") + loop, _ = sch.get_loops("C" if use_block_name else sch.get_block("C")) sch.compute_at(block, loop, preserve_unit_loops=True) tvm.ir.assert_structural_equal(two_elementwise_after_compute_at, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=two_elementwise) -def test_compute_at_blockized_1(): +def test_compute_at_blockized_1(use_block_name): sch = tir.Schedule(blockized_1, debug_mask="all") block = sch.get_block("B") _, loop = sch.get_loops(sch.get_block("C_outer")) @@ -1071,7 +1073,7 @@ def test_compute_at_blockized_1(): verify_trace_roundtrip(sch=sch, mod=blockized_1) -def test_compute_at_blockized_2(): +def test_compute_at_blockized_2(use_block_name): sch = tir.Schedule(blockized_2, debug_mask="all") block = sch.get_block("B_outer") _, loop, _, _ = sch.get_loops(sch.get_block("C")) @@ -1080,7 +1082,7 @@ def test_compute_at_blockized_2(): verify_trace_roundtrip(sch=sch, mod=blockized_2) -def test_compute_at_cuda_matmul_0(): +def test_compute_at_cuda_matmul_0(use_block_name): sch = tir.Schedule(cuda_matmul_0, debug_mask="all") block = sch.get_block("C") _, _, _, _, _, loop, _, _ = sch.get_loops(sch.get_block("C_local")) @@ -1089,7 +1091,7 @@ def test_compute_at_cuda_matmul_0(): verify_trace_roundtrip(sch=sch, mod=cuda_matmul_0) -def test_compute_at_cuda_matmul_1(): +def test_compute_at_cuda_matmul_1(use_block_name): sch = tir.Schedule(cuda_matmul_1, debug_mask="all") block = sch.get_block("A_shared_local") _, _, _, _, _, _, _, loop, _, _, _ = sch.get_loops(sch.get_block("C")) @@ -1098,7 +1100,7 @@ def test_compute_at_cuda_matmul_1(): verify_trace_roundtrip(sch=sch, mod=cuda_matmul_1) -def 
test_compute_at_cuda_matmul_2(): +def test_compute_at_cuda_matmul_2(use_block_name): sch = tir.Schedule(cuda_matmul_2, debug_mask="all") block = sch.get_block("B_shared_local") _, _, _, _, _, _, _, loop, _, _, _ = sch.get_loops(sch.get_block("C")) @@ -1107,7 +1109,7 @@ def test_compute_at_cuda_matmul_2(): verify_trace_roundtrip(sch=sch, mod=cuda_matmul_2) -def test_compute_at_cuda_matmul_3(): +def test_compute_at_cuda_matmul_3(use_block_name): sch = tir.Schedule(cuda_matmul_3, debug_mask="all") block = sch.get_block("A_shared") _, _, _, _, _, _, loop, _, _, _, _ = sch.get_loops(sch.get_block("C")) @@ -1116,7 +1118,7 @@ def test_compute_at_cuda_matmul_3(): verify_trace_roundtrip(sch=sch, mod=cuda_matmul_3) -def test_compute_at_cuda_matmul_4(): +def test_compute_at_cuda_matmul_4(use_block_name): sch = tir.Schedule(cuda_matmul_4, debug_mask="all") block = sch.get_block("B_shared") _, _, _, _, _, _, loop, _, _, _, _ = sch.get_loops(sch.get_block("C")) @@ -1125,7 +1127,7 @@ def test_compute_at_cuda_matmul_4(): verify_trace_roundtrip(sch=sch, mod=cuda_matmul_4) -def test_compute_at_reduction_block(): +def test_compute_at_reduction_block(use_block_name): sch = tir.Schedule(multi_reduction, debug_mask="all") block = sch.get_block("B") (loop,) = sch.get_loops(sch.get_block("C")) @@ -1134,7 +1136,7 @@ def test_compute_at_reduction_block(): verify_trace_roundtrip(sch=sch, mod=multi_reduction) -def test_compute_at_tiled_pooling_read_cache(): +def test_compute_at_tiled_pooling_read_cache(use_block_name): sch = tir.Schedule(tiled_pooling_read_cache, debug_mask="all") compute = sch.get_block("compute") _, w_o, _, _, _, _ = sch.get_loops(compute) @@ -1144,7 +1146,7 @@ def test_compute_at_tiled_pooling_read_cache(): verify_trace_roundtrip(sch=sch, mod=tiled_pooling_read_cache) -def test_compute_at_non_uniform_tiled_conv(): +def test_compute_at_non_uniform_tiled_conv(use_block_name): sch = tir.Schedule(non_uniform_tiled_conv, debug_mask="all") compute = sch.get_block("compute") sch.compute_at(sch.get_block("cache"), sch.get_loops(compute)[1]) @@ -1152,7 +1154,7 @@ def test_compute_at_non_uniform_tiled_conv(): verify_trace_roundtrip(sch=sch, mod=non_uniform_tiled_conv) -def test_compute_at_concat(): +def test_compute_at_concat(use_block_name): sch = tir.Schedule(concat_two_elemwise, debug_mask="all") concat = sch.get_block("T_concat") add1 = sch.get_block("T_add_1") @@ -1164,7 +1166,7 @@ def test_compute_at_concat(): verify_trace_roundtrip(sch=sch, mod=concat_two_elemwise) -def test_compute_at_tiled_repeat_op(): +def test_compute_at_tiled_repeat_op(use_block_name): sch = tir.Schedule(tiled_repeat_op, debug_mask="all") outer_ax, _ = sch.get_loops(sch.get_block("T_repeat")) sch.compute_at(sch.get_block("T_add"), outer_ax) @@ -1172,7 +1174,7 @@ def test_compute_at_tiled_repeat_op(): verify_trace_roundtrip(sch=sch, mod=tiled_repeat_op) -def test_reverse_compute_at_tiled(): +def test_reverse_compute_at_tiled(use_block_name): sch = tir.Schedule(tiled, debug_mask="all") block = sch.get_block("C") _, _, loop, _ = sch.get_loops(sch.get_block("B")) @@ -1181,7 +1183,7 @@ def test_reverse_compute_at_tiled(): verify_trace_roundtrip(sch=sch, mod=tiled) -def test_reverse_compute_at_tiled_trivial_binding(): +def test_reverse_compute_at_tiled_trivial_binding(use_block_name): sch = tir.Schedule(tiled_trivial_binding, debug_mask="all") block = sch.get_block("C") _, _, loop, _ = sch.get_loops(sch.get_block("B")) @@ -1190,7 +1192,7 @@ def test_reverse_compute_at_tiled_trivial_binding(): verify_trace_roundtrip(sch=sch, 
mod=tiled_trivial_binding) -def test_reverse_compute_at_blockized_2(): +def test_reverse_compute_at_blockized_2(use_block_name): sch = tir.Schedule(blockized_2, debug_mask="all") block = sch.get_block("C") _, loop = sch.get_loops(sch.get_block("B_outer")) @@ -1199,7 +1201,7 @@ def test_reverse_compute_at_blockized_2(): verify_trace_roundtrip(sch=sch, mod=blockized_2) -def test_reverse_compute_at_factorized(): +def test_reverse_compute_at_factorized(use_block_name): sch = tir.Schedule(factorized, debug_mask="all") block = sch.get_block("B") _, loop, _, _ = sch.get_loops(sch.get_block("B_rf")) @@ -1208,7 +1210,7 @@ def test_reverse_compute_at_factorized(): verify_trace_roundtrip(sch=sch, mod=factorized) -def test_reverse_compute_at_floordiv_and_floormod_indices(): +def test_reverse_compute_at_floordiv_and_floormod_indices(use_block_name): sch = tir.Schedule(floordiv_and_floormod_indices, debug_mask="all") A = sch.get_block("A") B = sch.get_block("B") @@ -1219,7 +1221,7 @@ def test_reverse_compute_at_floordiv_and_floormod_indices(): verify_trace_roundtrip(sch=sch, mod=floordiv_and_floormod_indices) -def test_read_out_of_bound(): +def test_read_out_of_bound(use_block_name): sch = tir.Schedule(read_out_of_bound, debug_mask="all") block = sch.get_block("B") (loop,) = sch.get_loops(sch.get_block("C")) @@ -1228,7 +1230,7 @@ def test_read_out_of_bound(): verify_trace_roundtrip(sch=sch, mod=read_out_of_bound) -def test_compact_dataflow(): +def test_compact_dataflow(use_block_name): sch = tir.Schedule(not_all_compact_data_flow, debug_mask="all") block = sch.get_block("B") _, loop = sch.get_loops(sch.get_block("C_1")) @@ -1237,7 +1239,7 @@ def test_compact_dataflow(): verify_trace_roundtrip(sch=sch, mod=not_all_compact_data_flow) -def test_compute_at_simplify_static_bound(): +def test_compute_at_simplify_static_bound(use_block_name): sch = tir.Schedule(static_bound, debug_mask="all") block = sch.get_block("B") loop, _ = sch.get_loops(sch.get_block("C")) @@ -1246,7 +1248,7 @@ def test_compute_at_simplify_static_bound(): verify_trace_roundtrip(sch=sch, mod=static_bound) -def test_compute_at_non_perfect_channel_group(): +def test_compute_at_non_perfect_channel_group(use_block_name): @T.prim_func def grouped_channel_bias( X: T.Buffer[(720, 8, 8), "float32"], Y: T.Buffer[(720, 8, 8), "float32"] @@ -1284,7 +1286,7 @@ def grouped_channel_bias_non_perfect_tiled( tvm.ir.assert_structural_equal(sch.mod["main"], grouped_channel_bias_non_perfect_tiled) -def test_fail_subtree_complete_block(): +def test_fail_subtree_complete_block(use_block_name): sch = tir.Schedule(fail_subtree_compact_dataflow, debug_mask="all") block = sch.get_block("B_0") loop, _ = sch.get_loops(sch.get_block("C")) @@ -1292,47 +1294,47 @@ def test_fail_subtree_complete_block(): sch.compute_at(block, loop) -def test_fail_not_in_same_scope(): +def test_fail_not_in_same_scope(use_block_name): sch = tir.Schedule(blockized_1, debug_mask="all") - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") loop, _ = sch.get_loops(sch.get_block("C_inner")) with pytest.raises(tvm.tir.ScheduleError, match="same block scope"): sch.compute_at(block, loop) -def test_fail_loop_is_ancestor_of_block(): +def test_fail_loop_is_ancestor_of_block(use_block_name): sch = tir.Schedule(two_elementwise, debug_mask="all") - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") loop, _ = sch.get_loops(sch.get_block("B")) with pytest.raises(tvm.tir.ScheduleError, match="ancestor of block"): sch.compute_at(block, loop) 
-def test_fail_output_block(): +def test_fail_output_block(use_block_name): sch = tir.Schedule(tiled, debug_mask="all") - block = sch.get_block("C") + block = "C" if use_block_name else sch.get_block("C") loop, _, _, _ = sch.get_loops(sch.get_block("B")) with pytest.raises(tvm.tir.ScheduleError, match="output block"): sch.compute_at(block, loop) -def test_fail_all_consumers_under_loop(): +def test_fail_all_consumers_under_loop(use_block_name): sch = tir.Schedule(fail_all_consumers_under_loop, debug_mask="all") - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") loop, _ = sch.get_loops(sch.get_block("C")) with pytest.raises(tvm.tir.ScheduleError, match="requires all the consumer"): sch.compute_at(block, loop) -def test_fail_all_producers_under_loop(): +def test_fail_all_producers_under_loop(use_block_name): sch = tir.Schedule(fail_all_producers_under_loop, debug_mask="all") - block = sch.get_block("D") + block = "D" if use_block_name else sch.get_block("D") loop, _ = sch.get_loops(sch.get_block("C")) with pytest.raises(tvm.tir.ScheduleError, match="requires all the producer"): sch.reverse_compute_at(block, loop) -def test_compute_at_int64_loop(): +def test_compute_at_int64_loop(use_block_name): def _create_prim_func(): n = te.var("n", dtype="int64") m = te.var("m", dtype="int64") @@ -1344,8 +1346,8 @@ def _create_prim_func(): mod = _create_prim_func() sch = tir.Schedule(mod, debug_mask="all") - block_c = sch.get_block("C") - block_d = sch.get_block("D") + block_c = "C" if use_block_name else sch.get_block("C") + block_d = "D" if use_block_name else sch.get_block("D") i, _ = sch.get_loops(block_d) sch.compute_at(block_c, i) verify_trace_roundtrip(sch=sch, mod=mod) diff --git a/tests/python/unittest/test_tir_schedule_compute_inline.py b/tests/python/unittest/test_tir_schedule_compute_inline.py index 84fb88218997f..617e13db27f60 100644 --- a/tests/python/unittest/test_tir_schedule_compute_inline.py +++ b/tests/python/unittest/test_tir_schedule_compute_inline.py @@ -587,10 +587,12 @@ def exp_exp_opaque_access_with_tvm_access_ptr_inlined( # pylint: enable=no-member,invalid-name,unused-variable +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_compute_inline_elementwise(): + +def test_compute_inline_elementwise(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") block_c = sch.get_block("C") sch.compute_inline(block_b) tvm.ir.assert_structural_equal(elementwise_inlined, sch.mod["main"]) @@ -598,9 +600,9 @@ def test_compute_inline_elementwise(): verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_compute_inline_under_loop(): +def test_compute_inline_under_loop(use_block_name): sch = tir.Schedule(elementwise_under_loop, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") block_c = sch.get_block("C") sch.compute_inline(block_b) tvm.ir.assert_structural_equal(elementwise_inlined, sch.mod["main"]) @@ -608,9 +610,9 @@ def test_compute_inline_under_loop(): verify_trace_roundtrip(sch=sch, mod=elementwise_under_loop) -def test_compute_inline_as_dce(): +def test_compute_inline_as_dce(use_block_name): sch = tir.Schedule(elementwise_standalone, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") block_c = sch.get_block("C") sch.compute_inline(block_b) tvm.ir.assert_structural_equal(elementwise_standalone_dce, 
sch.mod["main"]) @@ -618,9 +620,9 @@ def test_compute_inline_as_dce(): verify_trace_roundtrip(sch=sch, mod=elementwise_standalone) -def test_compute_inline_multi_consumer(): +def test_compute_inline_multi_consumer(use_block_name): sch = tir.Schedule(elementwise_multi_producer_consumer, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") block_c = sch.get_block("C") block_d = sch.get_block("D") sch.compute_inline(block_b) @@ -630,81 +632,81 @@ def test_compute_inline_multi_consumer(): verify_trace_roundtrip(sch=sch, mod=elementwise_multi_producer_consumer) -def test_compute_inline_fail_multi_writer(): +def test_compute_inline_fail_multi_writer(use_block_name): sch = tir.Schedule(fail_multi_reader_writer, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.compute_inline(block_b) -def test_reverse_compute_inline_elementwise(): +def test_reverse_compute_inline_elementwise(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") block_b = sch.get_block("B") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal(elementwise_inlined, sch.mod["main"]) assert sch.get(block_b).name_hint == "B" verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_reverse_compute_inline_under_loop(): +def test_reverse_compute_inline_under_loop(use_block_name): sch = tir.Schedule(elementwise_under_loop, debug_mask="all") block_b = sch.get_block("B") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal(elementwise_inlined, sch.mod["main"]) assert sch.get(block_b).name_hint == "B" verify_trace_roundtrip(sch=sch, mod=elementwise_under_loop) -def test_reverse_compute_inline_fail_as_dce(): +def test_reverse_compute_inline_fail_as_dce(use_block_name): sch = tir.Schedule(elementwise_standalone, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.reverse_compute_inline(block_b) -def test_reverse_compute_inline_fail_multi_producer(): +def test_reverse_compute_inline_fail_multi_producer(use_block_name): sch = tir.Schedule(elementwise_multi_producer_consumer, debug_mask="all") - block_d = sch.get_block("D") + block_d = "D" if use_block_name else sch.get_block("D") with pytest.raises(tvm.tir.ScheduleError): sch.reverse_compute_inline(block_d) -def test_reverse_compute_inline_fail_multi_reader(): +def test_reverse_compute_inline_fail_multi_reader(use_block_name): sch = tir.Schedule(fail_multi_reader_writer, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") with pytest.raises(tvm.tir.ScheduleError): sch.reverse_compute_inline(block_c) -def test_reverse_compute_multi_reverse_loads(): +def test_reverse_compute_multi_reverse_loads(use_block_name): sch = tir.Schedule(elementwise_multi_reverse_loads, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal(elementwise_multi_reverse_loads_inlined, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_multi_reverse_loads) -def test_reverse_compute_inline_affine_load(): +def test_reverse_compute_inline_affine_load(use_block_name): sch = 
tir.Schedule(elementwise_reverse_affine_load, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal(elementwise_reverse_affine_load_inlined, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_reverse_affine_load) -def test_reverse_compute_inline_multi_affine_load(): +def test_reverse_compute_inline_multi_affine_load(use_block_name): sch = tir.Schedule(elementwise_multi_reverse_affine_load, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal(elementwise_multi_reverse_affine_load_inlined, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_multi_reverse_affine_load) -def test_reverse_compute_inline_affine_load_unit_iter(): +def test_reverse_compute_inline_affine_load_unit_iter(use_block_name): sch = tir.Schedule(elementwise_reverse_affine_load_unit_iter, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal( elementwise_reverse_affine_load_unit_iter_inlined, sch.mod["main"] @@ -712,9 +714,9 @@ def test_reverse_compute_inline_affine_load_unit_iter(): verify_trace_roundtrip(sch=sch, mod=elementwise_reverse_affine_load_unit_iter) -def test_reverse_compute_inline_affine_load_unit_iter_simplified(): +def test_reverse_compute_inline_affine_load_unit_iter_simplified(use_block_name): sch = tir.Schedule(elementwise_reverse_affine_load_unit_iter_simplified, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") sch.reverse_compute_inline(block_c) tvm.ir.assert_structural_equal( elementwise_reverse_affine_load_unit_iter_simplified_inlined, sch.mod["main"] @@ -723,10 +725,10 @@ def test_reverse_compute_inline_affine_load_unit_iter_simplified(): @pytest.mark.parametrize("reverse_order", [True, False]) -def test_reverse_compute_inline_affine_chain(reverse_order): +def test_reverse_compute_inline_affine_chain(use_block_name, reverse_order): sch = tir.Schedule(elementwise_reverse_affine_chain, debug_mask="all") - block_c = sch.get_block("C") - block_d = sch.get_block("D") + block_c = "C" if use_block_name else sch.get_block("C") + block_d = "D" if use_block_name else sch.get_block("D") if reverse_order: sch.reverse_compute_inline(block_d) sch.reverse_compute_inline(block_c) @@ -737,68 +739,68 @@ def test_reverse_compute_inline_affine_chain(reverse_order): verify_trace_roundtrip(sch=sch, mod=elementwise_reverse_affine_chain) -def test_reverse_compute_fail_non_affine_load(): +def test_reverse_compute_fail_non_affine_load(use_block_name): sch = tir.Schedule(elementwise_reverse_non_affine_load, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") with pytest.raises(tvm.tir.ScheduleError): sch.reverse_compute_inline(block_c) -def test_reverse_compute_fail_multi_reverse_loads(): +def test_reverse_compute_fail_multi_reverse_loads(use_block_name): sch = tir.Schedule(elementwise_multi_loads, debug_mask="all") - block_c = sch.get_block("C") + block_c = "C" if use_block_name else sch.get_block("C") with pytest.raises(tvm.tir.ScheduleError): sch.reverse_compute_inline(block_c) -def test_opaque_access_load(): +def test_opaque_access_load(use_block_name): sch = tir.Schedule(opaque_access_load, debug_mask="all") - block_b = 
sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.compute_inline(block_b) -def test_opaque_access_store(): +def test_opaque_access_store(use_block_name): sch = tir.Schedule(opaque_access_store, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.compute_inline(block_b) -def test_buffer_matched(): +def test_buffer_matched(use_block_name): sch = tir.Schedule(buffer_matched, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") with pytest.raises(tvm.tir.ScheduleError): sch.compute_inline(block_b) -def test_output_block(): +def test_output_block(use_block_name): sch = tir.Schedule(matmul_relu, debug_mask="all") block = sch.get_block("compute") with pytest.raises(tvm.tir.ScheduleError): sch.compute_inline(block) -def test_compute_inline_predicate(): +def test_compute_inline_predicate(use_block_name): sch = tir.Schedule(elementwise_predicate, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") sch.compute_inline(block_b) tvm.ir.assert_structural_equal(elementwise_predicate_inlined, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_predicate) -def test_compute_inline_multi_loads(): +def test_compute_inline_multi_loads(use_block_name): sch = tir.Schedule(elementwise_multi_loads, debug_mask="all") - block_b = sch.get_block("B") + block_b = "B" if use_block_name else sch.get_block("B") sch.compute_inline(block_b) tvm.ir.assert_structural_equal(elementwise_multi_loads_inlined, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise_multi_loads) -def test_compute_inline_with_opaque_access(): +def test_compute_inline_with_opaque_access(use_block_name): """Test not rewrite opaque reads/writes after irrelavant compute inline""" sch = tir.Schedule(access_opaque_ptr_then_elemwise, debug_mask="all") - BB = sch.get_block("BB") + BB = "BB" if use_block_name else sch.get_block("BB") sch.compute_inline(BB) tvm.ir.assert_structural_equal(access_opaque_ptr_then_elemwise_inline, sch.mod["main"]) @@ -810,10 +812,10 @@ def test_inline_block_with_init(): sch.compute_inline(block=block) -def test_compute_inline_opaque_access_with_tvm_access_ptr(): +def test_compute_inline_opaque_access_with_tvm_access_ptr(use_block_name): """Test opaque access with tvm_access_ptr after compute inline""" sch = tir.Schedule(exp_exp_opaque_access_with_tvm_access_ptr, debug_mask="all") - compute = sch.get_block("compute") + compute = "compute" if use_block_name else sch.get_block("compute") sch.compute_inline(compute) tvm.ir.assert_structural_equal( exp_exp_opaque_access_with_tvm_access_ptr_inlined, sch.mod["main"] diff --git a/tests/python/unittest/test_tir_schedule_reduction.py b/tests/python/unittest/test_tir_schedule_reduction.py index a8348afb457d5..f3503460e50ac 100644 --- a/tests/python/unittest/test_tir_schedule_reduction.py +++ b/tests/python/unittest/test_tir_schedule_reduction.py @@ -215,19 +215,21 @@ def colsum_decompose_with_vectorization(a: T.handle, b: T.handle) -> None: # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_reduction_decompose0(): + +def test_reduction_decompose0(use_block_name): s = tir.Schedule(matmul, debug_mask="all") - C = s.get_block("update") + C = "update" if use_block_name 
else s.get_block("update") i, j, k = s.get_loops(C) s.decompose_reduction(C, i) tvm.ir.assert_structural_equal(matmul_decompose0, s.mod["main"]) verify_trace_roundtrip(s, mod=matmul) -def test_reduction_decompose1(): +def test_reduction_decompose1(use_block_name): s = tir.Schedule(rowsum_blockized, debug_mask="all") - blockized_B = s.get_block("blockized_B") + blockized_B = "blockized_B" if use_block_name else s.get_block("blockized_B") io, ko = s.get_loops(blockized_B) s.decompose_reduction(blockized_B, io) tvm.ir.assert_structural_equal(matmul_decompose1, s.mod["main"]) diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py index 9b2e37a19813a..c6776b0c8a3e2 100644 --- a/tests/python/unittest/test_tir_schedule_reindex.py +++ b/tests/python/unittest/test_tir_schedule_reindex.py @@ -168,35 +168,43 @@ def multiple_read(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "f B[vi, vj] = A[vj, vi] + A[vi, vj] -def test_reindex_read_basic(): +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) +use_buffer_name = tvm.testing.parameter(by_dict={"buffer_index": False, "buffer_name": True}) + + +def test_reindex_read_basic(use_block_name, use_buffer_name): sch = tir.Schedule(transpose_elementwise) - block = sch.get_block("B") - sch.reindex(block, 0, "read") + block = "B" if use_block_name else sch.get_block("B") + buf = "A" if use_buffer_name else ("read", 0) + sch.reindex(block, buf) tvm.ir.assert_structural_equal(transpose_elementwise_reindex_read, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=transpose_elementwise) -def test_conv2d_reindex_read(): +def test_conv2d_reindex_read(use_block_name, use_buffer_name): sch = tir.Schedule(conv2d_nhwc) - block = sch.get_block("conv2d_nhwc") - sch.reindex(block, 1, "read") + block = "conv2d_nhwc" if use_block_name else sch.get_block("conv2d_nhwc") + buf = "Weight" if use_buffer_name else ("read", 1) + sch.reindex(block, buf) tvm.ir.assert_structural_equal(conv2d_nhwc_reindex_weight, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc) -def test_matmul_reindex_write(): +def test_matmul_reindex_write(use_block_name, use_buffer_name): sch = tir.Schedule(matmul) - block = sch.get_block("matmul") - sch.reindex(block, 0, "write") + block = "matmul" if use_block_name else sch.get_block("matmul") + buf = "C" if use_buffer_name else ("write", 0) + sch.reindex(block, buf) tvm.ir.assert_structural_equal(matmul_reindex_write, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=matmul) -def test_reindex_fail_multiple_read(): +def test_reindex_fail_multiple_read(use_block_name, use_buffer_name): sch = tir.Schedule(multiple_read) - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") + buf = "A" if use_buffer_name else ("read", 0) with pytest.raises(ScheduleError): - sch.reindex(block, 0, "read") + sch.reindex(block, buf) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_schedule_sampling.py b/tests/python/unittest/test_tir_schedule_sampling.py index 17f35ea8f72fe..0c2a3d27ffdb2 100644 --- a/tests/python/unittest/test_tir_schedule_sampling.py +++ b/tests/python/unittest/test_tir_schedule_sampling.py @@ -179,10 +179,16 @@ def test_sample_perfect_tile_composite(): verify_trace_roundtrip(sch, mod=elementwise) -def test_sample_compute_location(): +use_sugared_block = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) + + +def test_sample_compute_location(use_sugared_block): n = 
100 sch = tir.Schedule(tiled_conv2d_with_padding, seed=42, debug_mask="all") - pad_input = sch.get_block("PadInput") + if use_sugared_block: + pad_input = "PadInput" + else: + pad_input = sch.get_block("PadInput") decision_dict = dict() for _ in range(n): _ = sch.sample_compute_location(pad_input) # pylint: disable=invalid-name diff --git a/tests/python/unittest/test_tir_schedule_set_scope.py b/tests/python/unittest/test_tir_schedule_set_scope.py index 29c4880f77622..b2e8479462ebe 100644 --- a/tests/python/unittest/test_tir_schedule_set_scope.py +++ b/tests/python/unittest/test_tir_schedule_set_scope.py @@ -86,20 +86,21 @@ def element_wise_subregion_match_set_scope(A: T.Buffer[(128, 128), "float32"], C # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_set_scope(): +def test_set_scope(use_block_name): func = element_wise s = tir.Schedule(func, debug_mask='all') - s.set_scope(s.get_block("B"), 0, "shared") + s.set_scope('B' if use_block_name else s.get_block("B"), 0, "shared") tvm.ir.assert_structural_equal(element_wise_set_scope, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) -def test_set_scope_fail_on_output_buffer(): +def test_set_scope_fail_on_output_buffer(use_block_name): func = element_wise s = tir.Schedule(func, debug_mask='all') with pytest.raises(tvm.tir.ScheduleError): - s.set_scope(s.get_block("C"), 0, "shared") + s.set_scope('C' if use_block_name else s.get_block("C"), 0, "shared") def test_set_scope_fail_on_index_out_of_bound(): diff --git a/tests/python/unittest/test_tir_schedule_storage_align.py b/tests/python/unittest/test_tir_schedule_storage_align.py index 3b699fd8f1b2d..072640c8f3af5 100644 --- a/tests/python/unittest/test_tir_schedule_storage_align.py +++ b/tests/python/unittest/test_tir_schedule_storage_align.py @@ -98,10 +98,12 @@ def element_wise_invalid_annotation(a: T.handle, c: T.handle) -> None: C[vi_1, vj_1] = (B[vi_1, vj_1] + T.float32(1)) -def test_storage_align(): +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) + +def test_storage_align(use_block_name): func = element_wise s = tir.Schedule(func, debug_mask='all') - B = s.get_block("B") + B = 'B' if use_block_name else s.get_block("B") s.storage_align(B, 0, axis=0, factor=128, offset=127) tvm.ir.assert_structural_equal(element_wise_storage_align, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py index e184bc3f627c3..205bd5091268b 100644 --- a/tests/python/unittest/test_tir_schedule_transform_layout.py +++ b/tests/python/unittest/test_tir_schedule_transform_layout.py @@ -171,15 +171,13 @@ def conv2d_nhwc_transformed( # pylint: enable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks # fmt: on -use_sugared_transform = tvm.testing.parameter( - by_dict={"transform_layout": False, "transform_layout_sugared": True} -) +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) -def test_two_elementwise_transform_intermediate_buffer(use_sugared_transform): +def test_two_elementwise_transform_intermediate_buffer(use_block_name): sch = tir.Schedule(two_elementwise, debug_mask="all") - if use_sugared_transform: + if use_block_name: sch.transform_layout( block="B", buffer="B", @@ -193,10 +191,10 @@ def 
test_two_elementwise_transform_intermediate_buffer(use_sugared_transform): verify_trace_roundtrip(sch=sch, mod=two_elementwise) -def test_two_elementwise_transform_input_buffer(use_sugared_transform): +def test_two_elementwise_transform_input_buffer(use_block_name): sch = tir.Schedule(two_elementwise, debug_mask="all") - if use_sugared_transform: + if use_block_name: sch.transform_layout( index_map=packed_index_map_func, block="B", @@ -210,10 +208,10 @@ def test_two_elementwise_transform_input_buffer(use_sugared_transform): verify_trace_roundtrip(sch=sch, mod=two_elementwise) -def test_two_elementwise_transform_output_buffer(use_sugared_transform): +def test_two_elementwise_transform_output_buffer(use_block_name): sch = tir.Schedule(two_elementwise, debug_mask="all") - if use_sugared_transform: + if use_block_name: sch.transform_layout( index_map=packed_index_map_func, block="C", @@ -295,17 +293,17 @@ def summation_3d_split( tvm.ir.assert_structural_equal(summation_3d_split, sch.mod["main"]) -def test_transform_block_layout_basic(): +def test_transform_block_layout_basic(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") sch.transform_block_layout(block, lambda i, j: (i * 128 + j,)) tvm.ir.assert_structural_equal(elementwise_transformed, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=elementwise) -def test_transform_block_layout_conv2d_nhwc(): +def test_transform_block_layout_conv2d_nhwc(use_block_name): sch = tir.Schedule(conv2d_nhwc, debug_mask="all") - block = sch.get_block("conv2d_nhwc") + block = "conv2d_nhwc" if use_block_name else sch.get_block("conv2d_nhwc") sch.transform_block_layout( block, lambda n, h, w, co, rh, rw, rc: (n * 112 * 112 + h * 112 + w, co, rh * 7 * 3 + rw * 3 + rc), @@ -314,16 +312,16 @@ def test_transform_block_layout_conv2d_nhwc(): verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc) -def test_transform_block_layout_fail_non_affine(): +def test_transform_block_layout_fail_non_affine(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") - block = sch.get_block("B") + block = "B" if use_block_name else sch.get_block("B") with pytest.raises(tir.ScheduleError): sch.transform_block_layout(block, lambda i, j: (i + j,)) -def test_transform_block_layout_fail_mixed_iter_type(): +def test_transform_block_layout_fail_mixed_iter_type(use_block_name): sch = tir.Schedule(conv2d_nhwc, debug_mask="all") - block = sch.get_block("conv2d_nhwc") + block = "conv2d_nhwc" if use_block_name else sch.get_block("conv2d_nhwc") with pytest.raises(tir.ScheduleError): sch.transform_block_layout( block, diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py index 0d23d3f95211d..b7517aab7cd37 100644 --- a/tests/python/unittest/test_tir_schedule_utilities.py +++ b/tests/python/unittest/test_tir_schedule_utilities.py @@ -104,6 +104,8 @@ def matmul_relu_ann2(a: T.handle, b: T.handle, d: T.handle) -> None: # pylint: enable=no-member,invalid-name,unused-variable +use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) + def test_tir_schedule_creation(): # Tests: @@ -131,24 +133,24 @@ def test_tir_schedule_get_block(): assert block.same_as(matmul.body.block.body.body.body[1].body.block) -def test_tir_schedule_get_loops(): +def test_tir_schedule_get_loops(use_block_name): # Tests: # - Schedule.get_loops # - Schedule.get sch = tir.Schedule(matmul, debug_mask="all") - block_rv = 
sch.get_block(name="update") - i, j, k = sch.get_loops(block_rv) + block = "update" if use_block_name else sch.get_block(name="update") + i, j, k = sch.get_loops(block) assert sch.get(i).loop_var.name == "i" assert sch.get(j).loop_var.name == "j" assert sch.get(k).loop_var.name == "k" -def test_tir_schedule_copy_1(): +def test_tir_schedule_copy_1(use_block_name): # Tests: # - Schedule.copy sch_1 = tir.Schedule(matmul, debug_mask="all") block_rv = sch_1.get_block(name="update") - i, j, k = sch_1.get_loops(block_rv) + i, j, k = sch_1.get_loops(block="update" if use_block_name else block_rv) assert sch_1.get(i).loop_var.name == "i" assert sch_1.get(j).loop_var.name == "j" assert sch_1.get(k).loop_var.name == "k" @@ -218,9 +220,9 @@ def test_get_child_blocks(): assert s.get(update) == s.get(blocks[1]) -def test_get_producers(): +def test_get_producers(use_block_name): sch = tir.Schedule(mod=matmul_relu, debug_mask="all") - block = sch.get_block("relu") + block = "relu" if use_block_name else sch.get_block("relu") (producer,) = sch.get_producers(block) assert tvm.ir.structural_equal( sch.get_sref(producer).stmt, @@ -229,9 +231,9 @@ def test_get_producers(): verify_trace_roundtrip(sch, mod=matmul_relu) -def test_get_consumers(): +def test_get_consumers(use_block_name): sch = tir.Schedule(mod=matmul_relu, debug_mask="all") - block = sch.get_block("matmul") + block = "matmul" if use_block_name else sch.get_block("matmul") (consumer,) = sch.get_consumers(block) assert tvm.ir.structural_equal( sch.get_sref(consumer).stmt, From 6d557ffae2db64fcea127b5e34089d9bc8e74fb0 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 9 Jun 2022 15:01:48 -0700 Subject: [PATCH 084/181] [ci] Rebuild Docker images if necessary (#11329) This rebuilds Docker images and uses them in later stages in the same build. If the build is running on `main`, then the images are uploaded to Docker Hub automatically once the run is complete. Images are always rebuilt, but Docker Hub functions as a cache. If there have been no changes to `docker/` since the last available hash on Docker Hub, then the build will just use the images from Hub. --- Jenkinsfile | 393 ++++++++++++++++--------- jenkins/Build.groovy.j2 | 23 ++ jenkins/Deploy.groovy.j2 | 50 ++++ jenkins/DockerBuild.groovy.j2 | 240 ++++++--------- jenkins/Jenkinsfile.j2 | 3 + jenkins/Lint.groovy.j2 | 10 +- jenkins/Prepare.groovy.j2 | 11 + tests/python/ci/test_ci.py | 97 +++++- tests/scripts/cmd_utils.py | 21 +- tests/scripts/git_utils.py | 1 + tests/scripts/http_utils.py | 34 +++ tests/scripts/should_rebuild_docker.py | 154 ++++++++++ 12 files changed, 737 insertions(+), 300 deletions(-) create mode 100644 tests/scripts/http_utils.py create mode 100755 tests/scripts/should_rebuild_docker.py diff --git a/Jenkinsfile b/Jenkinsfile index 0205a1e7364fe..ec4cea52d67b3 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-06-02T14:03:43.284817 +// Generated at 2022-06-09T09:42:12.430625 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
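The caching behaviour described above comes down to three checks: look up the newest tag for the image on Docker Hub, split the git hash off the end of that tag (tags pushed from main follow the <date>-<time>-<short hash> pattern used in the deploy stage), and ask git whether anything under docker/ changed since that hash. A minimal sketch of that decision, assuming a hypothetical tlcpackstaging repository name and assuming Docker Hub returns the most recently pushed tag first, could look like the snippet below; the full script added by this patch (tests/scripts/should_rebuild_docker.py) layers logging, pagination, and error handling on top of the same idea.

import json
import subprocess
from urllib.request import urlopen

DOCKER_API_BASE = "https://hub.docker.com/v2/"

def newest_tag(repo: str) -> str:
    # repo is hypothetical here, e.g. "tlcpackstaging/ci_cpu"; tags pushed from
    # main look like "20220609-094212-abc12345" (date, time, short git hash)
    url = f"{DOCKER_API_BASE}repositories/{repo}/tags?page_size=1&page=1"
    with urlopen(url) as response:
        results = json.loads(response.read())["results"]
    # assumption: Docker Hub lists the most recently pushed tag first
    return results[0]["name"]

def needs_rebuild(repo: str) -> bool:
    git_hash = newest_tag(repo).split("-")[-1]
    # if the hash is unknown locally, the cached image cannot be matched to this checkout
    exists = subprocess.run(
        ["git", "rev-parse", "--quiet", "--verify", git_hash],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    if exists.returncode != 0:
        return True
    # any change under docker/ since that hash invalidates the cached image
    diff = subprocess.run(
        ["git", "diff", git_hash, "--", "docker/"],
        stdout=subprocess.PIPE,
        encoding="utf-8",
    )
    return diff.stdout.strip() != ""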
--> @@ -97,6 +97,7 @@ if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { // Filenames for stashing between build and test steps s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" + // General note: Jenkins has limits on the size of a method (or top level code) // that are pretty strict, so most usage of groovy methods in these templates // are purely to satisfy the JVM @@ -171,6 +172,17 @@ def docker_init(image) { """, label: 'Clean old Docker images', ) + + if (image.contains("amazonaws.com")) { + // If this string is in the image name it's from ECR and needs to be pulled + // with the right credentials + ecr_pull(image) + } else { + sh( + script: "docker pull ${image}", + label: 'Pull docker image', + ) + } } def should_skip_slow_tests(pr_number) { @@ -273,16 +285,50 @@ def prepare() { } } } -def build_image(image_name) { - hash = sh( +def ecr_push(full_name) { + aws_account_id = sh( returnStdout: true, - script: 'git log -1 --format=\'%h\'' + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' ).trim() - def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" - sh( - script: "${docker_build} ${image_name} --spec ${full_name}", - label: 'Build docker image' - ) + + def ecr_name = "${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com/${full_name}" + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -x + docker tag ${full_name} \$AWS_ECR_REPO/${full_name} + docker push \$AWS_ECR_REPO/${full_name} + """, + label: 'Upload image to ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } + return ecr_name +} + +def ecr_pull(full_name) { aws_account_id = sh( returnStdout: true, script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', @@ -290,153 +336,144 @@ def build_image(image_name) { ).trim() try { - // Use a credential so Jenkins knows to scrub the AWS account ID which is nice - // (but so we don't have to rely it being hardcoded in Jenkins) - withCredentials([string( - credentialsId: 'aws-account-id', - variable: '_ACCOUNT_ID_DO_NOT_USE', - )]) { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION=us-west-2']) { - sh( - script: ''' - set -x - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com - ''', - label: 'Log in to ECR' - ) - sh( - script: """ - set -x - docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - """, - label: 'Upload image to ECR' - ) - } + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + 
label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) } } finally { - sh( - script: 'rm -f ~/.docker/config.json', - label: 'Clean up login credentials' - ) + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } } +} + +def build_image(image_name) { + hash = sh( + returnStdout: true, + script: 'git log -1 --format=\'%h\'' + ).trim() + def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" sh( - script: "docker rmi ${full_name}", - label: 'Remove docker image' + script: "${docker_build} ${image_name} --spec ${full_name}", + label: 'Build docker image' ) + return ecr_push(full_name) } + def build_docker_images() { stage('Docker Image Build') { - // TODO in a follow up PR: Find ecr tag and use in subsequent builds - parallel 'ci-lint': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_lint') + parallel( + 'ci_arm': { + node('ARM') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_arm = build_image('ci_arm') + build_image('ci_arm') + } } - } - }, 'ci-cpu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_cpu') + }, + 'ci_cpu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_cpu = build_image('ci_cpu') + build_image('ci_cpu') + } } - } - }, 'ci-gpu': { - node('GPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_gpu') + }, + 'ci_gpu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_gpu = build_image('ci_gpu') + build_image('ci_gpu') + } } - } - }, 'ci-qemu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_qemu') + }, + 'ci_hexagon': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_hexagon = build_image('ci_hexagon') + build_image('ci_hexagon') + } } - } - }, 'ci-i386': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_i386') + }, + 'ci_i386': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_i386 = build_image('ci_i386') + build_image('ci_i386') + } } - } - }, 'ci-arm': { - node('ARM') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_arm') + }, + 'ci_lint': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_lint = build_image('ci_lint') + build_image('ci_lint') + } } - } - }, 'ci-wasm': { - node('CPU') { 
- timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_wasm') + }, + 'ci_qemu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_qemu = build_image('ci_qemu') + build_image('ci_qemu') + } } - } - }, 'ci-hexagon': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_hexagon') + }, + 'ci_wasm': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_wasm = build_image('ci_wasm') + build_image('ci_wasm') + } } - } - } - } - // // TODO: Once we are able to use the built images, enable this step - // // If the docker images changed, we need to run the image build before the lint - // // can run since it requires a base docker image. Most of the time the images - // // aren't build though so it's faster to use the same node that checks for - // // docker changes to run the lint in the usual case. - // stage('Sanity Check (re-run)') { - // timeout(time: max_time, unit: 'MINUTES') { - // node('CPU') { - // ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/sanity") { - // init_git() - // sh ( - // script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - // label: 'Run lint', - // ) - // } - // } - // } - // } -} - -// Run make. First try to do an incremental make from a previous workspace in hope to -// accelerate the compilation. If something is wrong, clean the workspace and then -// build from scratch. -def make(docker_type, path, make_flag) { - timeout(time: max_time, unit: 'MINUTES') { - try { - cmake_build(docker_type, path, make_flag) - // always run cpp test when build - } catch (hudson.AbortException ae) { - // script exited due to user abort, directly throw instead of retry - if (ae.getMessage().contains('script returned exit code 143')) { - throw ae - } - echo 'Incremental compilation failed. Fall back to build from scratch' - sh ( - script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", - label: 'Clear old cmake workspace', - ) - cmake_build(docker_type, path, make_flag) - } + }, + ) } } def lint() { @@ -531,6 +568,29 @@ def add_hexagon_permissions() { ) } +// Run make. First try to do an incremental make from a previous workspace in hope to +// accelerate the compilation. If something is wrong, clean the workspace and then +// build from scratch. +def make(docker_type, path, make_flag) { + timeout(time: max_time, unit: 'MINUTES') { + try { + cmake_build(docker_type, path, make_flag) + } catch (hudson.AbortException ae) { + // script exited due to user abort, directly throw instead of retry + if (ae.getMessage().contains('script returned exit code 143')) { + throw ae + } + echo 'Incremental compilation failed. 
Fall back to build from scratch' + sh ( + script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", + label: 'Clear old cmake workspace', + ) + cmake_build(docker_type, path, make_flag) + } + } +} + + def build() { stage('Build') { environment { @@ -3239,6 +3299,25 @@ stage('Build packages') { } */ + +def update_docker(ecr_image, hub_image) { + if (!ecr_image.contains("amazonaws.com")) { + sh("echo Skipping '${ecr_image}' since it doesn't look like an ECR image") + return + } + docker_init(ecr_image) + sh( + script: """ + set -eux + docker tag \ + ${ecr_image} \ + ${hub_image} + docker push ${hub_image} + """, + label: "Update ${hub_image} on Docker Hub", + ) +} + def deploy_docs() { // Note: This code must stay in the Jenkinsfile to ensure that it runs // from a trusted context only @@ -3298,6 +3377,42 @@ def deploy() { } } } + if (env.BRANCH_NAME == 'main' && env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null) { + node('CPU') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docker") { + try { + withCredentials([string( + credentialsId: 'dockerhub-tlcpackstaging-key', + variable: 'DOCKERHUB_KEY', + )]) { + sh( + script: 'docker login -u tlcpackstaging -p ${DOCKERHUB_KEY}', + label: 'Log in to Docker Hub', + ) + } + def date_Ymd_HMS = sh( + script: 'python3 -c \'import datetime; print(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\'', + label: 'Determine date', + returnStdout: true, + ).trim() + def tag = "${date_Ymd_HMS}-${upstream_revision.substring(0, 8)}" + update_docker(ci_arm, "tlcpackstaging/test_ci_arm:${tag}") + update_docker(ci_cpu, "tlcpackstaging/test_ci_cpu:${tag}") + update_docker(ci_gpu, "tlcpackstaging/test_ci_gpu:${tag}") + update_docker(ci_hexagon, "tlcpackstaging/test_ci_hexagon:${tag}") + update_docker(ci_i386, "tlcpackstaging/test_ci_i386:${tag}") + update_docker(ci_lint, "tlcpackstaging/test_ci_lint:${tag}") + update_docker(ci_qemu, "tlcpackstaging/test_ci_qemu:${tag}") + update_docker(ci_wasm, "tlcpackstaging/test_ci_wasm:${tag}") + } finally { + sh( + script: 'docker logout', + label: 'Clean up login credentials' + ) + } + } + } + } } } diff --git a/jenkins/Build.groovy.j2 b/jenkins/Build.groovy.j2 index 62ccc94916048..fcde53f559395 100644 --- a/jenkins/Build.groovy.j2 +++ b/jenkins/Build.groovy.j2 @@ -52,6 +52,29 @@ def add_hexagon_permissions() { {% endfor %} } +// Run make. First try to do an incremental make from a previous workspace in hope to +// accelerate the compilation. If something is wrong, clean the workspace and then +// build from scratch. +def make(docker_type, path, make_flag) { + timeout(time: max_time, unit: 'MINUTES') { + try { + cmake_build(docker_type, path, make_flag) + } catch (hudson.AbortException ae) { + // script exited due to user abort, directly throw instead of retry + if (ae.getMessage().contains('script returned exit code 143')) { + throw ae + } + echo 'Incremental compilation failed. 
Fall back to build from scratch' + sh ( + script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", + label: 'Clear old cmake workspace', + ) + cmake_build(docker_type, path, make_flag) + } + } +} + + def build() { stage('Build') { environment { diff --git a/jenkins/Deploy.groovy.j2 b/jenkins/Deploy.groovy.j2 index 917f71ded1ff3..3a049c5141dd9 100644 --- a/jenkins/Deploy.groovy.j2 +++ b/jenkins/Deploy.groovy.j2 @@ -16,6 +16,25 @@ stage('Build packages') { } */ + +def update_docker(ecr_image, hub_image) { + if (!ecr_image.contains("amazonaws.com")) { + sh("echo Skipping '${ecr_image}' since it doesn't look like an ECR image") + return + } + docker_init(ecr_image) + sh( + script: """ + set -eux + docker tag \ + ${ecr_image} \ + ${hub_image} + docker push ${hub_image} + """, + label: "Update ${hub_image} on Docker Hub", + ) +} + def deploy_docs() { // Note: This code must stay in the Jenkinsfile to ensure that it runs // from a trusted context only @@ -67,5 +86,36 @@ def deploy() { } } } + if (env.BRANCH_NAME == 'main' && env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null) { + node('CPU') { + ws({{ m.per_exec_ws('tvm/deploy-docker') }}) { + try { + withCredentials([string( + credentialsId: 'dockerhub-tlcpackstaging-key', + variable: 'DOCKERHUB_KEY', + )]) { + sh( + script: 'docker login -u tlcpackstaging -p ${DOCKERHUB_KEY}', + label: 'Log in to Docker Hub', + ) + } + def date_Ymd_HMS = sh( + script: 'python3 -c \'import datetime; print(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\'', + label: 'Determine date', + returnStdout: true, + ).trim() + def tag = "${date_Ymd_HMS}-${upstream_revision.substring(0, 8)}" + {% for image in images %} + update_docker({{ image.name }}, "tlcpackstaging/test_{{ image.name }}:${tag}") + {% endfor %} + } finally { + sh( + script: 'docker logout', + label: 'Clean up login credentials' + ) + } + } + } + } } } diff --git a/jenkins/DockerBuild.groovy.j2 b/jenkins/DockerBuild.groovy.j2 index e9d80801a9d9c..a0ff666773f75 100644 --- a/jenkins/DockerBuild.groovy.j2 +++ b/jenkins/DockerBuild.groovy.j2 @@ -1,13 +1,47 @@ -def build_image(image_name) { - hash = sh( +def ecr_push(full_name) { + aws_account_id = sh( returnStdout: true, - script: 'git log -1 --format=\'%h\'' + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' ).trim() - def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" - sh( - script: "${docker_build} ${image_name} --spec ${full_name}", - label: 'Build docker image' - ) + + def ecr_name = "${aws_account_id}.{{ aws_ecr_url }}/${full_name}" + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -x + docker tag ${full_name} \$AWS_ECR_REPO/${full_name} + docker push \$AWS_ECR_REPO/${full_name} + """, + label: 'Upload image to ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } + return ecr_name +} + +def ecr_pull(full_name) { aws_account_id = sh( returnStdout: true, script: 'aws sts 
get-caller-identity | grep Account | cut -f4 -d\\"', @@ -15,152 +49,68 @@ def build_image(image_name) { ).trim() try { - // Use a credential so Jenkins knows to scrub the AWS account ID which is nice - // (but so we don't have to rely it being hardcoded in Jenkins) - withCredentials([string( - credentialsId: 'aws-account-id', - variable: '_ACCOUNT_ID_DO_NOT_USE', - )]) { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION=us-west-2']) { - sh( - script: ''' - set -x - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com - ''', - label: 'Log in to ECR' - ) - sh( - script: """ - set -x - docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - """, - label: 'Upload image to ECR' - ) - } + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) } } finally { - sh( - script: 'rm -f ~/.docker/config.json', - label: 'Clean up login credentials' - ) + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } } +} + +def build_image(image_name) { + hash = sh( + returnStdout: true, + script: 'git log -1 --format=\'%h\'' + ).trim() + def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" sh( - script: "docker rmi ${full_name}", - label: 'Remove docker image' + script: "${docker_build} ${image_name} --spec ${full_name}", + label: 'Build docker image' ) + return ecr_push(full_name) } + def build_docker_images() { stage('Docker Image Build') { - // TODO in a follow up PR: Find ecr tag and use in subsequent builds - parallel 'ci-lint': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_lint') - } - } - }, 'ci-cpu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_cpu') + parallel( + {% for image in images %} + '{{ image.name }}': { + node('{{ image.platform }}') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // {{ image.name }} = build_image('{{ image.name }}') + build_image('{{ image.name }}') + } } - } - }, 'ci-gpu': { - node('GPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_gpu') - } - } - }, 'ci-qemu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_qemu') - } - } - }, 'ci-i386': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_i386') - } - } - }, 'ci-arm': { - node('ARM') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_arm') - } - } - }, 'ci-wasm': { - node('CPU') { - 
timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_wasm') - } - } - }, 'ci-hexagon': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - docker_init('none') - init_git() - build_image('ci_hexagon') - } - } - } - } - // // TODO: Once we are able to use the built images, enable this step - // // If the docker images changed, we need to run the image build before the lint - // // can run since it requires a base docker image. Most of the time the images - // // aren't build though so it's faster to use the same node that checks for - // // docker changes to run the lint in the usual case. - // stage('Sanity Check (re-run)') { - // timeout(time: max_time, unit: 'MINUTES') { - // node('CPU') { - // ws({{ m.per_exec_ws('tvm/sanity') }}) { - // init_git() - // sh ( - // script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - // label: 'Run lint', - // ) - // } - // } - // } - // } -} - -// Run make. First try to do an incremental make from a previous workspace in hope to -// accelerate the compilation. If something is wrong, clean the workspace and then -// build from scratch. -def make(docker_type, path, make_flag) { - timeout(time: max_time, unit: 'MINUTES') { - try { - cmake_build(docker_type, path, make_flag) - // always run cpp test when build - } catch (hudson.AbortException ae) { - // script exited due to user abort, directly throw instead of retry - if (ae.getMessage().contains('script returned exit code 143')) { - throw ae - } - echo 'Incremental compilation failed. Fall back to build from scratch' - sh ( - script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", - label: 'Clear old cmake workspace', - ) - cmake_build(docker_type, path, make_flag) - } + }, + {% endfor %} + ) } } diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index c165de964feb4..4e344c56d7f72 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -100,6 +100,9 @@ if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { {% set hexagon_api = ['build/hexagon_api_output',] %} s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" +{% set aws_default_region = "us-west-2" %} +{% set aws_ecr_url = "dkr.ecr." 
+ aws_default_region + ".amazonaws.com" %} + // General note: Jenkins has limits on the size of a method (or top level code) // that are pretty strict, so most usage of groovy methods in these templates // are purely to satisfy the JVM diff --git a/jenkins/Lint.groovy.j2 b/jenkins/Lint.groovy.j2 index 40dad3aef7be3..3ede64301c935 100644 --- a/jenkins/Lint.groovy.j2 +++ b/jenkins/Lint.groovy.j2 @@ -2,11 +2,11 @@ def lint() { stage('Lint') { parallel( {% call m.sharded_lint_step( - name='Lint', - num_shards=2, - node='CPU-SMALL', - ws='tvm/lint', - docker_image='ci_lint', + name='Lint', + num_shards=2, + node='CPU-SMALL', + ws='tvm/lint', + docker_image='ci_lint', ) %} sh ( diff --git a/jenkins/Prepare.groovy.j2 b/jenkins/Prepare.groovy.j2 index 2900775f49452..894ddc72eeb7b 100644 --- a/jenkins/Prepare.groovy.j2 +++ b/jenkins/Prepare.groovy.j2 @@ -69,6 +69,17 @@ def docker_init(image) { """, label: 'Clean old Docker images', ) + + if (image.contains("amazonaws.com")) { + // If this string is in the image name it's from ECR and needs to be pulled + // with the right credentials + ecr_pull(image) + } else { + sh( + script: "docker pull ${image}", + label: 'Pull docker image', + ) + } } def should_skip_slow_tests(pr_number) { diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 042c109dd9d49..7ef2f0cd58452 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -18,9 +18,11 @@ import subprocess import sys import json +from tempfile import tempdir import textwrap import pytest import tvm.testing +from pathlib import Path from test_utils import REPO_ROOT @@ -29,11 +31,13 @@ class TempGit: def __init__(self, cwd): self.cwd = cwd - def run(self, *args): - proc = subprocess.run(["git"] + list(args), cwd=self.cwd) + def run(self, *args, **kwargs): + proc = subprocess.run(["git"] + list(args), encoding="utf-8", cwd=self.cwd, **kwargs) if proc.returncode != 0: raise RuntimeError(f"git command failed: '{args}'") + return proc + def test_cc_reviewers(tmpdir_factory): reviewers_script = REPO_ROOT / "tests" / "scripts" / "github_cc_reviewers.py" @@ -747,5 +751,94 @@ def run(type, data, check): ) +@pytest.mark.parametrize( + "changed_files,name,check,expected_code", + [ + d.values() + for d in [ + dict( + changed_files=[], + name="abc", + check="Image abc is not using new naming scheme", + expected_code=1, + ), + dict( + changed_files=[], name="123-123-abc", check="No extant hash found", expected_code=1 + ), + dict( + changed_files=[["test.txt"]], + name=None, + check="Did not find changes, no rebuild necessary", + expected_code=0, + ), + dict( + changed_files=[["test.txt"], ["docker/test.txt"]], + name=None, + check="Found docker changes", + expected_code=2, + ), + ] + ], +) +def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expected_code): + tag_script = REPO_ROOT / "tests" / "scripts" / "should_rebuild_docker.py" + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + git.run("init") + git.run("config", "user.name", "ci") + git.run("config", "user.email", "email@example.com") + git.run("checkout", "-b", "main") + git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") + + git_path = Path(git.cwd) + for i, commits in enumerate(changed_files): + for filename in commits: + path = git_path / filename + path.parent.mkdir(exist_ok=True, parents=True) + path.touch() + git.run("add", filename) + + git.run("commit", "-m", f"message {i}") + + if name is None: + ref = "HEAD" + if len(changed_files) > 1: + ref = 
f"HEAD~{len(changed_files) - 1}" + proc = git.run("rev-parse", ref, stdout=subprocess.PIPE) + last_hash = proc.stdout.strip() + name = f"123-123-{last_hash}" + + docker_data = { + "repositories/tlcpack": { + "results": [ + { + "name": "ci-something", + }, + { + "name": "something-else", + }, + ], + }, + "repositories/tlcpack/ci-something/tags": { + "results": [{"name": name}, {"name": name + "old"}], + }, + } + + proc = subprocess.run( + [ + str(tag_script), + "--testing-docker-data", + json.dumps(docker_data), + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding="utf-8", + cwd=git.cwd, + ) + + assert_in(check, proc.stdout) + assert proc.returncode == expected_code + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/scripts/cmd_utils.py b/tests/scripts/cmd_utils.py index 272086796e8df..771c3ee52dbd2 100644 --- a/tests/scripts/cmd_utils.py +++ b/tests/scripts/cmd_utils.py @@ -44,18 +44,21 @@ def init_log(): class Sh: - def __init__(self, env=None): + def __init__(self, env=None, cwd=None): self.env = os.environ.copy() if env is not None: self.env.update(env) + self.cwd = cwd def run(self, cmd: str, **kwargs): logging.info(f"+ {cmd}") - if "check" not in kwargs: - kwargs["check"] = True - if "shell" not in kwargs: - kwargs["shell"] = True - if "env" not in kwargs: - kwargs["env"] = self.env - - subprocess.run(cmd, **kwargs) + defaults = { + "check": True, + "shell": True, + "env": self.env, + "encoding": "utf-8", + "cwd": self.cwd, + } + defaults.update(kwargs) + + return subprocess.run(cmd, **defaults) diff --git a/tests/scripts/git_utils.py b/tests/scripts/git_utils.py index 267756d859050..c5ea8d85e0718 100644 --- a/tests/scripts/git_utils.py +++ b/tests/scripts/git_utils.py @@ -20,6 +20,7 @@ import subprocess import re import base64 +import logging from urllib import request from typing import Dict, Tuple, Any, Optional, List diff --git a/tests/scripts/http_utils.py b/tests/scripts/http_utils.py new file mode 100644 index 0000000000000..c14259479d3be --- /dev/null +++ b/tests/scripts/http_utils.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import json +import logging +from urllib import request +from typing import Dict, Any, Optional + + +def get(url: str, headers: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + logging.info(f"Requesting GET to {url}") + if headers is None: + headers = {} + req = request.Request(url, headers=headers) + with request.urlopen(req) as response: + response_headers = {k: v for k, v in response.getheaders()} + response = json.loads(response.read()) + + return response, response_headers diff --git a/tests/scripts/should_rebuild_docker.py b/tests/scripts/should_rebuild_docker.py new file mode 100755 index 0000000000000..dc12c38de8303 --- /dev/null +++ b/tests/scripts/should_rebuild_docker.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import argparse +import datetime +import json +import logging +import subprocess + +from typing import Dict, Any, List + + +from http_utils import get +from cmd_utils import Sh, init_log + + +DOCKER_API_BASE = "https://hub.docker.com/v2/" +PAGE_SIZE = 25 +TEST_DATA = None + + +def docker_api(url: str) -> Dict[str, Any]: + """ + Run a paginated fetch from the public Docker Hub API + """ + if TEST_DATA is not None: + return TEST_DATA[url] + pagination = f"?page_size={PAGE_SIZE}&page=1" + url = DOCKER_API_BASE + url + pagination + r, headers = get(url) + reset = headers.get("x-ratelimit-reset") + if reset is not None: + reset = datetime.datetime.fromtimestamp(int(reset)) + reset = reset.isoformat() + logging.info( + f"Docker API Rate Limit: {headers.get('x-ratelimit-remaining')} / {headers.get('x-ratelimit-limit')} (reset at {reset})" + ) + if "results" not in r: + raise RuntimeError(f"Error fetching data, no results found in: {r}") + return r + + +def any_docker_changes_since(hash: str) -> bool: + """ + Check the docker/ directory, return True if there have been any code changes + since the specified hash + """ + sh = Sh() + cmd = f"git diff {hash} -- docker/" + proc = sh.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout = proc.stdout.strip() + return stdout != "", stdout + + +def does_commit_exist(hash: str) -> bool: + """ + Returns True if the hash exists in the repo + """ + sh = Sh() + cmd = f"git rev-parse -q {hash}" + proc = sh.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) + print(proc.stdout) + if proc.returncode == 0: + return True + + if "unknown revision or path not in the working tree" in proc.stdout: + return False + + raise RuntimeError(f"Unexpected failure when running: {cmd}") + + +def find_hash_for_tag(tag: Dict[str, Any]) -> str: + """ + Split the hash off of a name like -