diff --git a/CMakeLists.txt b/CMakeLists.txt index 306a8be308584..5e9de5b66fa4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -295,6 +295,7 @@ tvm_file_glob(GLOB_RECURSE RELAY_OP_SRCS ) tvm_file_glob(GLOB_RECURSE RELAY_PASS_SRCS src/relay/analysis/*.cc + src/relay/collage/*.cc src/relay/transforms/*.cc src/relay/quantize/*.cc ) diff --git a/collage_autotvm_rtx3070.tuninglog b/collage_autotvm_rtx3070.tuninglog new file mode 100644 index 0000000000000..d71fdc551054d --- /dev/null +++ b/collage_autotvm_rtx3070.tuninglog @@ -0,0 +1,314 @@ +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 4580561, "code_hash": null, "entity": [["tile_x", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 4, 2]], ["tile_k", "sp", [-1, 24, 1]]]}, "result": [[0.0011703262512077295, 0.0011717971932367149, 0.001173296154589372], 0, 2.1227333545684814, 1649806187.4005826], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 15358, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 4], ["warp_col_tiles", "ot", 2], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 8]]}, "result": [[4.1945246429498836e-05, 4.20124894832511e-05, 4.206600389509219e-05], 0, 1.5296962261199951, 1649810763.887689], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, 
"entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.0017168304615384617, 0.0017169396923076923, 0.0017175537692307693], 0, 1.3375377655029297, 1649811474.8204205], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 44253765, "code_hash": null, "entity": [["tile_x", "sp", [-1, 16, 4, 1]], ["tile_y", "sp", [-1, 12, 4, 1]], ["tile_k", "sp", [-1, 4, 4]]]}, "result": [[0.0010211715700483093, 0.0010240237777777777, 0.001028588038647343], 0, 5.8342344760894775, 1649812254.5651937], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 15439, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 4], ["warp_col_tiles", "ot", 2], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 8]]}, "result": [[0.0001372752479338843, 0.00013770897272727273, 0.00013806367107438016], 0, 1.7162327766418457, 1649813685.478743], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 6, "code_hash": null, "entity": [["tile_k", "sp", [-1, 12]]]}, "result": [[0.016466704999999998, 0.0164685995, 0.0164699646], 0, 2.3179266452789307, 1649814497.3615065], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], 
"float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 65237833, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 8, 1]], ["tile_y", "sp", [-1, 16, 8, 1]], ["tile_k", "sp", [-1, 6, 4]]]}, "result": [[0.0004378561058495821, 0.0004378913426183844, 0.0004379270584958217], 0, 6.199182510375977, 1649879456.1792707], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 10265, "code_hash": null, "entity": [["block_row_warps", "ot", 4], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 1], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 16]]}, "result": [[0.00011448282788944724, 0.00011478408417085428, 0.00011491263505025125], 0, 1.6557848453521729, 1649880882.8527882], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.00506267196875, 0.005084877875, 0.005099014031250001], 0, 1.4695072174072266, 1649881655.5889988], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_tensorcore.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 10210, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 1], ["warp_col_tiles", "ot", 1], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", 
"ot", 8], ["wmma_m", "ot", 16]]}, "result": [[1.617625803613724e-05, 1.6183566759152736e-05, 1.621106395073425e-05], 0, 1.9195764064788818, 1649882086.06151], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 11588, "code_hash": null, "entity": [["tile_y", "sp", [-1, 4, 8]], ["tile_x", "sp", [-1, 32, 1]], ["tile_k", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 64], ["unroll_explicit", "ot", 0]]}, "result": [[2.080483443708609e-05, 2.0816280353200882e-05, 2.083283294960348e-05], 0, 1.7147831916809082, 1649883281.0301726], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_tensorcore.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 3060, "code_hash": null, "entity": [["block_row_warps", "ot", 1], ["block_col_warps", "ot", 1], ["warp_row_tiles", "ot", 2], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 0], ["vec", "ot", 4], ["wmma_m", "ot", 32]]}, "result": [[1.4489859942609439e-05, 1.4505717687282495e-05, 1.4542667378960864e-05], 0, 2.024470806121826, 1649884257.5815887], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 10118, "code_hash": null, "entity": [["tile_y", "sp", [-1, 4, 8]], ["tile_x", "sp", [-1, 32, 1]], ["tile_k", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 32], ["unroll_explicit", "ot", 0]]}, "result": [[2.0256624645161293e-05, 2.0411544258064515e-05, 
2.0471600903225806e-05], 0, 1.4079077243804932, 1649885460.5937498], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 44926587, "code_hash": null, "entity": [["tile_x", "sp", [-1, 16, 2, 1]], ["tile_y", "sp", [-1, 12, 2, 1]], ["tile_k", "sp", [-1, 1, 8]]]}, "result": [[0.004547732660377358, 0.004549542320754717, 0.0045726865471698115], 0, 3.755682945251465, 1649890840.317121], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 10192, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 2], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 16]]}, "result": [[0.000155265987607245, 0.00015604860152526216, 0.0001561262850333651], 0, 1.7700400352478027, 1649891493.4140472], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.0100141173, 0.01003659415, 0.0100415897], 0, 1.8085365295410156, 1649892333.886837], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": 
{"index": 34318245, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[5.8621e-06, 5.8625999999999996e-06, 5.8631e-06], 0, 0.5667502880096436, 1649968594.6997252], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15657147, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.3555999999999995e-06, 7.35565e-06, 7.3781000000000006e-06], 0, 0.7575399875640869, 1650319502.366343], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 6072920, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 56, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.77155e-06, 5.7885e-06, 5.81905e-06], 0, 1.1465270519256592, 1650323152.8143198], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 
"float32"], {}], "config": {"index": 9785718, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 2, 8]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[4.79855e-06, 4.81355e-06, 4.81455e-06], 0, 0.8575699329376221, 1650325997.844903], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 63569063, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.2961600000000001e-05, 1.2997649999999999e-05, 1.3000100000000001e-05], 0, 0.71053147315979, 1650330982.632312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 96, 112, 112], "float32"], ["TENSOR", [96, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 6588288, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 2, 4]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[1.655015e-05, 1.655815e-05, 1.66232e-05], 0, 2.27034854888916, 1650334575.9977324], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 
1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15104777, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 4, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.8786000000000005e-06, 6.899099999999999e-06, 6.915549999999999e-06], 0, 1.0633580684661865, 1650336756.828295], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 99179720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 6, 4]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[6.287049999999999e-06, 6.2876e-06, 6.31505e-06], 0, 2.9279582500457764, 1650340980.0825403], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 12539100, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 7]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[6.821099999999999e-06, 6.8255999999999995e-06, 6.850050000000001e-06], 0, 1.4625983238220215, 1650347726.6219482], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 
144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 18208026, "code_hash": null, "entity": [["tile_f", "sp", [-1, 3, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.36655e-06, 8.3721e-06, 8.377600000000001e-06], 0, 0.9377567768096924, 1650350453.9616745], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2479050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[4.43905e-06, 4.45505e-06, 4.46055e-06], 0, 1.6908175945281982, 1650355541.390374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 7217873, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.71105e-06, 5.7370499999999996e-06, 5.7421e-06], 0, 2.532322883605957, 1650358471.728223], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", 
[["TENSOR", [1, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9368144, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[4.228049999999999e-06, 4.2320499999999996e-06, 4.25555e-06], 0, 0.9063670635223389, 1650363448.7873058], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2508576, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[3.8405e-06, 3.84355e-06, 3.84605e-06], 0, 0.98264479637146, 1650366846.8783011], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 6769873, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[7.15055e-06, 7.1561e-06, 7.1566e-06], 0, 2.941910743713379, 1650369652.7351506], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 4656979, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[9.7176e-06, 9.7226e-06, 9.7241e-06], 0, 0.7810451984405518, 1650373123.1804664], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 28, 28], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 13558142, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[9.5361e-06, 9.554600000000002e-06, 9.5571e-06], 0, 1.1758761405944824, 1650377186.8371902], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 3415200, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 7]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.35005e-06, 5.37305e-06, 5.377549999999999e-06], 0, 1.0839922428131104, 1650380285.1866374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 
-thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 5194603, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.72372e-05, 1.727575e-05, 1.72932e-05], 0, 0.6490328311920166, 1650382649.6077251], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 702720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.667600000000001e-06, 4.67305e-06, 4.70105e-06], 0, 0.8721504211425781, 1650384995.9365287], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [128, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 13507251, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.216615e-05, 1.21901e-05, 1.2196650000000001e-05], 0, 0.9639995098114014, 1650392782.272614], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 4, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 50062362, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 2, 1, 2]], ["tile_y", "sp", [-1, 1, 2, 4]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[7.382600000000001e-06, 7.39615e-06, 7.4001e-06], 0, 2.2717225551605225, 1650424944.499887], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 45700339, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.95423e-05, 3.9552349999999994e-05, 3.9561300000000004e-05], 0, 2.2523245811462402, 1650430175.9074345], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 18573740, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", 
"ot", 0]]}, "result": [[2.01327e-05, 2.0137700000000002e-05, 2.01432e-05], 0, 1.0902690887451172, 1650436320.534228], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9657656, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.146595e-05, 4.152795e-05, 4.1557949999999994e-05], 0, 0.8977162837982178, 1650441321.9324644], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 29864625, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 2, 2]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[7.57113e-05, 7.581275e-05, 7.593024999999999e-05], 0, 0.7584812641143799, 1650447316.5830467], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 33342114, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 1, 8]], ["tile_y", "sp", [-1, 1, 4, 1]], 
["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[9.6796e-06, 9.6861e-06, 9.722649999999999e-06], 0, 2.743138313293457, 1650451842.4668179], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 17125050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.177395e-05, 4.179295e-05, 4.1814449999999994e-05], 0, 1.3813395500183105, 1650456647.9994063], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 7532887, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.2836050000000004e-05, 5.2853600000000005e-05, 5.28761e-05], 0, 0.8911774158477783, 1650459586.3381498], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 
"float32"], {}], "config": {"index": 14151622, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.994355e-05, 5.005455e-05, 5.0164050000000006e-05], 0, 1.1229846477508545, 1650462059.7162044], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 33536454, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[7.202550000000001e-06, 7.2061e-06, 7.2231e-06], 0, 2.9223580360412598, 1650465984.0746238], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15365101, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[7.897534999999999e-05, 7.900685e-05, 7.902235e-05], 0, 1.4378130435943604, 1650471801.2672544], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda 
-keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 5252700, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[9.1631e-06, 9.168100000000001e-06, 9.189600000000001e-06], 0, 2.878096342086792, 1650475376.2008486], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3200427, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[5.586205e-05, 5.58681e-05, 5.60791e-05], 0, 1.1162426471710205, 1650478289.7313669], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3185277, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], 
["unroll_explicit", "ot", 1]]}, "result": [[6.979539999999999e-05, 7.000990000000001e-05, 7.007095e-05], 0, 1.088874340057373, 1650503541.7996006], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 208633, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[6.765845e-05, 6.768145000000001e-05, 6.770450000000001e-05], 0, 0.7414331436157227, 1650505644.5295877], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 6381558, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[8.845050000000001e-06, 8.855549999999999e-06, 8.88055e-06], 0, 3.725031852722168, 1650509494.6496606], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3492656, "code_hash": null, 
"entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[0.00010417354999999999, 0.00010418149999999999, 0.00010423009999999998], 0, 1.3148417472839355, 1650512952.687562], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 761250, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[1.1431599999999999e-05, 1.143465e-05, 1.1438650000000001e-05], 0, 5.0806190967559814, 1650515567.1315808], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 346926, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.6393e-05, 8.645245000000001e-05, 8.649599999999999e-05], 0, 1.271730661392212, 1650518280.522468], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 345844, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[0.00013144805, 0.0001315021, 0.00013154105], 0, 1.8906102180480957, 1650520029.3195784], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [1024, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 85322, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[0.0001166115, 0.00011664449999999998, 0.00011665295], 0, 0.9735369682312012, 1650521383.2568395], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 1503126, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 
1]]}, "result": [[1.08582e-05, 1.085915e-05, 1.085915e-05], 0, 3.2422332763671875, 1650522436.543362], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 2276787, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.7111e-06, 8.716649999999999e-06, 8.727600000000001e-06], 0, 0.9531784057617188, 1650594246.792376], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 14467404, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 2, 7, 1]], ["tile_x", "sp", [-1, 1, 2, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.0403099999999999e-05, 1.040915e-05, 1.04111e-05], 0, 3.6800646781921387, 1650596813.592452], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 1013880, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 
256], ["unroll_explicit", "ot", 1]]}, "result": [[3.897049999999999e-06, 3.91e-06, 3.92105e-06], 0, 1.0108630657196045, 1650599520.8898165], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3021363, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.4886650000000001e-05, 1.4891699999999998e-05, 1.489365e-05], 0, 1.296245813369751, 1650602852.9900029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 75600, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.5125499999999995e-06, 3.5165e-06, 3.52305e-06], 0, 0.7898616790771484, 1650604889.7693179], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 198967, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 
1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.187615e-05, 1.188065e-05, 1.189265e-05], 0, 0.9806699752807617, 1650607006.9707348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 723143, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.1326e-06, 8.1471e-06, 8.1586e-06], 0, 1.4358642101287842, 1650608992.9624553], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [960, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 78004, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[3.2270500000000003e-06, 3.2275e-06, 3.2370500000000006e-06], 0, 0.7736132144927979, 1650610364.983347], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 245551, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", 
"sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.996525e-05, 1.997425e-05, 2.00027e-05], 0, 1.1400103569030762, 1650611207.975019], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 385083, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.39933e-05, 2.4011299999999998e-05, 2.40183e-05], 0, 0.6805839538574219, 1650613344.2010403], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 418478, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 80]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.70027e-05, 1.70537e-05, 1.70547e-05], 0, 1.3821897506713867, 1650615147.8768744], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1280, 1, 1], "float32"], ["TENSOR", [1000, 1280, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 33248, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", 
[-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 80]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[2.96109e-05, 2.964935e-05, 2.96514e-05], 0, 1.2670972347259521, 1650616370.1425672], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"], {}], "config": {"index": 76584961, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 7]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.018785e-05, 3.021085e-05, 3.0319899999999997e-05], 0, 2.8301658630371094, 1650641335.9067605], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9455056, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.2506e-06, 8.2611e-06, 8.262600000000001e-06], 0, 0.9980921745300293, 1650646562.0615616], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 152951, "code_hash": 
null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 4, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.538185e-05, 2.54259e-05, 2.5454850000000002e-05], 0, 1.7803680896759033, 1650673519.9665477], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 87933763, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.642805e-05, 3.644205e-05, 3.6453999999999994e-05], 0, 0.7112207412719727, 1650676122.1516056], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 26113940, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.661535e-05, 2.6678849999999996e-05, 2.6717849999999997e-05], 0, 0.46747660636901855, 1650677634.7384923], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], 
[1, 1], "float32"], {}], "config": {"index": 17407042, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.10854e-05, 7.109085e-05, 7.11029e-05], 0, 0.7810003757476807, 1650680015.5899546], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 12549050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[2.24233e-05, 2.2450800000000002e-05, 2.24848e-05], 0, 0.708620548248291, 1650681553.685307], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 4732823, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.2699899999999995e-05, 3.27019e-05, 3.2721899999999996e-05], 0, 0.47304797172546387, 1650683034.4890563], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"conv2d_nchw_winograd.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 545975, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 4, 8]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.8336850000000006e-05, 2.8396349999999998e-05, 2.839935e-05], 0, 1.9903209209442139, 1650684248.641913], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 35838982, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.2699049999999996e-05, 4.2715550000000005e-05, 4.272905e-05], 0, 0.7070727348327637, 1650686601.5726962], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2812995, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 1, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[0.00010861985, 0.00010863435, 0.0001086479], 0, 0.7317159175872803, 1650688514.756369], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 1574517, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.8229850000000002e-05, 2.8233350000000003e-05, 2.825485e-05], 0, 0.5290811061859131, 1650689649.0227518], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 2015046, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[4.65231e-05, 4.652755e-05, 4.65556e-05], 0, 0.5142829418182373, 1650690398.0774868], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 83259, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 4]], ["tile_x", "sp", [-1, 7, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.3635449999999997e-05, 3.365495e-05, 3.366745e-05], 0, 1.430091381072998, 
1650691249.2586534], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 3569676, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.170045000000001e-05, 7.171045000000001e-05, 7.17234e-05], 0, 0.7033722400665283, 1650692260.7750976], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 82748, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[0.00019157085, 0.0001915749, 0.00019157595], 0, 0.5982248783111572, 1650693315.4548712], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 319626, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 7]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], 
["unroll_explicit", "ot", 1]]}, "result": [[4.5206150000000004e-05, 4.5243650000000005e-05, 4.538615e-05], 0, 0.6440334320068359, 1650694063.325511], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 227061, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.21551e-05, 8.217515e-05, 8.223510000000001e-05], 0, 0.5891335010528564, 1650694683.6785614], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 190201, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 4]], ["tile_x", "sp", [-1, 1, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.184079999999999e-05, 5.184079999999999e-05, 5.1939300000000006e-05], 0, 0.6536734104156494, 1650695338.214328], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 791579, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 7]], ["tile_rc", "sp", [-1, 16]], 
["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.482695e-05, 9.48854e-05, 9.493345e-05], 0, 1.4417965412139893, 1650697059.9097984], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[2.1966849999999998e-05, 2.197135e-05, 2.19843e-05], 0, 0.4426250457763672, 1650697491.6213834], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [128, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 13507399, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.0136100000000001e-05, 1.0145149999999999e-05, 1.0159099999999999e-05], 0, 0.7864212989807129, 1650724653.7135007], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 22158174, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 1, 4]], ["tile_y", "sp", [-1, 1, 8, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], 
["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[6.9551e-06, 6.962599999999998e-06, 6.9646e-06], 0, 0.7211275100708008, 1650726163.0400467], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [256, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 20804619, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.81296e-05, 2.8130599999999998e-05, 2.814765e-05], 0, 0.5774815082550049, 1650727742.5586843], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [256, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 18573740, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.541225e-05, 1.54172e-05, 1.54537e-05], 0, 0.8937528133392334, 1650729498.3622022], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [128, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 17433624, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 
1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.12759e-05, 3.13064e-05, 3.13069e-05], 0, 0.7797591686248779, 1650731061.2478383], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 13412619, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.96386e-05, 4.970209999999999e-05, 4.970365e-05], 0, 0.6517493724822998, 1650732352.357867], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 36224490, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[8.519599999999999e-06, 8.5411e-06, 8.5516e-06], 0, 0.9091670513153076, 1650733940.9465184], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [512, 256, 
1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 4481521, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[3.21494e-05, 3.21714e-05, 3.218245e-05], 0, 0.627678394317627, 1650735124.7769744], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 14581630, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[4.980815e-05, 4.9814650000000006e-05, 4.9839099999999995e-05], 0, 0.9194021224975586, 1650736745.8550131], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [256, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 14448421, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.917355e-05, 3.918705e-05, 3.920855e-05], 0, 0.6747403144836426, 1650738065.3893676], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 
-thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 26828436, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 4, 2, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0], ["fuse_yx", "ot", 1]]}, "result": [[6.2195500000000004e-06, 6.2225999999999995e-06, 6.2360999999999995e-06], 0, 0.9589457511901855, 1650740107.0941224], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 8353521, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.006074999999999e-05, 6.0142750000000006e-05, 6.016975e-05], 0, 0.6222403049468994, 1650741436.1006827], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 11704914, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", 
"sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[9.4151e-06, 9.4271e-06, 9.44265e-06], 0, 1.0609674453735352, 1650742517.3273706], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1032786, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.349615e-05, 4.3545600000000005e-05, 4.35581e-05], 0, 0.6170883178710938, 1650743933.698596], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3334053, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[6.675835e-05, 6.676189999999999e-05, 6.67844e-05], 0, 0.8200364112854004, 1650745036.010395], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [512, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1560709, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", 
"sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.721785000000001e-05, 5.723285e-05, 5.7259849999999994e-05], 0, 0.7093315124511719, 1650746357.0012312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 2887260, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0], ["fuse_yx", "ot", 0]]}, "result": [[8.0721e-06, 8.0771e-06, 8.077649999999999e-06], 0, 1.0441720485687256, 1650747537.4062088], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1077100, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[8.750664999999999e-05, 8.752015e-05, 8.752465e-05], 0, 0.6045539379119873, 1650749113.1365385], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 
1024, 14, 14], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 1530018, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[1.173565e-05, 1.1736649999999999e-05, 1.174615e-05], 0, 1.378283977508545, 1650750073.250046], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 91389, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[6.80499e-05, 6.80609e-05, 6.810035e-05], 0, 0.781482458114624, 1650751244.5316045], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 353303, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.634765e-05, 9.63937e-05, 9.644220000000001e-05], 0, 1.0039925575256348, 
1650752141.1724086], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [1024, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 305015, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.278935e-05, 9.279235e-05, 9.27989e-05], 0, 1.5176756381988525, 1650753010.2550077], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 1514322, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[1.128565e-05, 1.129965e-05, 1.130615e-05], 0, 2.028611183166504, 1650754020.2265894], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [32, 3, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 16795845, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 3]], 
["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[6.9991000000000005e-06, 7.01165e-06, 7.02965e-06], 0, 0.6407957077026367, 1650813939.149559], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 17044204, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.348049999999999e-06, 6.3581e-06, 6.3720999999999995e-06], 0, 0.5244479179382324, 1650816895.9108312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 4975320, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 56, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.4956e-06, 5.5181e-06, 5.5605500000000005e-06], 0, 0.6335549354553223, 1650817497.488131], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [16, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 22304829, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 2, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", 
[-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.6231e-06, 4.635050000000001e-06, 4.64005e-06], 0, 1.0218045711517334, 1650818669.1302266], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 16, 112, 112], "float16"], ["TENSOR", [96, 16, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 53847898, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 2]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.173100000000001e-06, 7.176100000000001e-06, 7.176100000000001e-06], 0, 0.8336286544799805, 1650819872.646985], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 96, 112, 112], "float16"], ["TENSOR", [96, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 6922048, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 1]], ["tile_x", "sp", [-1, 1, 8, 7]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[6.5691e-06, 6.575050000000001e-06, 6.577099999999999e-06], 0, 0.763239860534668, 1650821086.6168768], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 56, 56], "float16"], ["TENSOR", [24, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 15206426, "code_hash": null, "entity": [["tile_f", "sp", [-1, 3, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], 
["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.61605e-06, 5.61705e-06, 5.6181000000000005e-06], 0, 0.7702598571777344, 1650822139.752306], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 24, 56, 56], "float16"], ["TENSOR", [144, 24, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 52846557, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.506550000000001e-06, 5.50705e-06, 5.5156e-06], 0, 0.8066568374633789, 1650823884.4936872], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [144, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 10299450, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 8]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[6.08305e-06, 6.0940500000000005e-06, 6.1166e-06], 0, 0.6963748931884766, 1650825481.4431868], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [24, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 34561186, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 6, 4]], 
["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 4, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[7.8756e-06, 7.8781e-06, 7.8936e-06], 0, 0.600581169128418, 1650826356.6420047], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [144, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3175550, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.35055e-06, 4.3510500000000005e-06, 4.37055e-06], 0, 1.0711958408355713, 1650827886.3514745], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 28, 28], "float16"], ["TENSOR", [32, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 7217856, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.16955e-06, 5.1786e-06, 5.1786e-06], 0, 1.1553356647491455, 1650828704.5764093], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 28, 28], "float16"], ["TENSOR", [192, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3039664, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], 
["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[3.95305e-06, 3.95305e-06, 3.96305e-06], 0, 0.5106737613677979, 1650829874.8851082], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [192, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3046176, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.69705e-06, 3.7045500000000004e-06, 3.71005e-06], 0, 0.5545451641082764, 1650830991.8584304], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [32, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3283875, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.763050000000001e-06, 5.76505e-06, 5.7651e-06], 0, 0.8265197277069092, 1650831715.1286488], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [64, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 4522579, "code_hash": null, "entity": [["tile_f", 
"sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.88765e-06, 8.8976e-06, 8.9001e-06], 0, 0.7089235782623291, 1650833027.5999374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 28, 28], "float16"], ["TENSOR", [384, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 15094839, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.6831e-06, 7.7046e-06, 7.7066e-06], 0, 1.0405845642089844, 1650834890.3047237], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [384, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3585120, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 4]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.17855e-06, 5.18105e-06, 5.2031e-06], 0, 0.6448190212249756, 1650836046.3030014], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [64, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 12193990, "code_hash": null, "entity": [["tile_f", "sp", 
[-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.315215e-05, 1.315965e-05, 1.31667e-05], 0, 1.0322420597076416, 1650837129.5166345], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [384, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 702720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.46905e-06, 4.47055e-06, 4.472050000000001e-06], 0, 0.5480890274047852, 1650837907.944853], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 14, 14], "float16"], ["TENSOR", [96, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 2391475, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.8601e-06, 7.8606e-06, 7.867599999999999e-06], 0, 0.8951401710510254, 1650839601.1209044], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 14, 14], "float16"], ["TENSOR", [576, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3808975, "code_hash": null, "entity": 
[["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[7.5676e-06, 7.5796e-06, 7.5806e-06], 0, 0.5769634246826172, 1650841183.037211], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [576, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1017240, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[3.3840500000000002e-06, 3.3995499999999995e-06, 3.4110500000000006e-06], 0, 0.5795722007751465, 1650842147.0042777], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [96, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3079603, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.072065e-05, 1.0726149999999999e-05, 1.073915e-05], 0, 1.0619502067565918, 1650843026.3948507], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [576, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": 
{"index": 48759, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[3.3625500000000005e-06, 3.37555e-06, 3.3775000000000003e-06], 0, 0.4328014850616455, 1650843813.8582454], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 7, 7], "float16"], ["TENSOR", [160, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 412645, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.0541650000000001e-05, 1.056665e-05, 1.061315e-05], 0, 2.12727427482605, 1650844230.8154716], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 160, 7, 7], "float16"], ["TENSOR", [960, 160, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 658631, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.8161e-06, 6.823100000000001e-06, 6.841600000000001e-06], 0, 1.1339399814605713, 1650844824.3560324], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [960, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 
1], [1, 1], "float16"], {}], "config": {"index": 34996, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 0]]}, "result": [[3.2030500000000006e-06, 3.20655e-06, 3.22755e-06], 0, 0.46258974075317383, 1650845433.5094543], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [160, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 256311, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.67563e-05, 1.6783300000000002e-05, 1.67918e-05], 0, 1.3270132541656494, 1650846386.3554056], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [320, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 535611, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[2.13803e-05, 2.138485e-05, 2.14313e-05], 0, 0.6166877746582031, 1650847120.3649514], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 320, 7, 7], "float16"], ["TENSOR", [1280, 320, 1, 1], "float16"], [1, 1], [0, 0, 
0, 0], [1, 1], "float16"], {}], "config": {"index": 872914, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 160]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.4967699999999999e-05, 1.4970200000000001e-05, 1.497575e-05], 0, 3.8850479125976562, 1650848115.3614283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1280, 1, 1], "float16"], ["TENSOR", [1000, 1280, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 18036, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 5, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.67544e-05, 2.68309e-05, 2.6850400000000003e-05], 0, 0.5703048706054688, 1650848714.866295], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 7797746, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 2, 1]], ["tile_y", "sp", [-1, 2, 2, 9]], ["tile_k", "sp", [-1, 8, 1]]]}, "result": [[0.0038837555500000004, 0.00388391505, 0.0038845200499999996], 0, 2.5623362064361572, 1650922517.9653258], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, 
"entity": [["tile_k", "sp", [-1, 8]]]}, "result": [[0.026828510899999998, 0.026837540599999998, 0.0268397599], 0, 3.169992446899414, 1650922648.847927], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 64], "float32"], ["TENSOR", [600, 32, 64], "float32"], [600, 32, 32], "float32", 0, 1], {}], "config": {"index": 4951, "code_hash": null, "entity": [["tile_y", "sp", [-1, 2, 8]], ["tile_x", "sp", [-1, 16, 1]], ["tile_k", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 16], ["unroll_explicit", "ot", 0]]}, "result": [[3.28759e-05, 3.29589e-05, 3.2961349999999996e-05], 0, 0.5255897045135498, 1650922854.3883076], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 32], "float32"], ["TENSOR", [600, 64, 32], "float32"], [600, 32, 64], "float32", 0, 1], {}], "config": {"index": 27148, "code_hash": null, "entity": [["tile_y", "sp", [-1, 2, 8]], ["tile_x", "sp", [-1, 16, 1]], ["tile_k", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 64], ["unroll_explicit", "ot", 1]]}, "result": [[3.2340449999999996e-05, 3.23409e-05, 3.23504e-05], 0, 0.5354282855987549, 1650923290.7878547], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 4812433, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 8, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_k", "sp", [-1, 24, 1]]]}, "result": [[0.00024860935, 0.00024866190000000005, 0.00024869645], 0, 1.9623265266418457, 1650924106.1380405], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.00217030065, 0.0021703712, 0.00217064015], 0, 0.5637376308441162, 1650924967.713869], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 44601969, "code_hash": null, "entity": [["tile_x", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 4, 4]], ["tile_k", "sp", [-1, 4, 4]]]}, "result": [[0.0018861851000000003, 0.00188620165, 0.0018940356999999999], 0, 1.277604103088379, 1650927453.3620265], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 8]]]}, "result": [[0.03605818615, 0.0362008626, 0.0362174195], 0, 4.180805444717407, 1650928571.656076], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], "float32"], null, "float32"], {}], "config": {"index": 15182559, "code_hash": null, "entity": [["tile_x", "sp", [-1, 2, 2, 5]], ["tile_y", "sp", [-1, 4, 2, 3]], ["tile_k", "sp", [-1, 8, 2]]]}, "result": [[0.00559109345, 0.00559257735, 0.0056176812], 0, 3.4281482696533203, 1650932142.7651248], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], 
"float32"], null, "float32"], {}], "config": {"index": 11, "code_hash": null, "entity": [["tile_k", "sp", [-1, 64]]]}, "result": [[0.0361240734, 0.036126658900000004, 0.03614633765], 0, 4.267355918884277, 1650932378.2820215], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [64, 3, 7, 7], "float16"], [2, 2], [3, 3, 3, 3], [1, 1], "float16"], {}], "config": {"index": 76554127, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 2, 4]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 7]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.347395e-05, 3.347595e-05, 3.352245e-05], 0, 2.189602851867676, 1650936160.5027037], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 9992684, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.1521e-06, 7.1581e-06, 7.1616e-06], 0, 0.80893874168396, 1650937165.8348072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 346643, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", 
[-1, 2, 4, 4]], ["tile_x", "sp", [-1, 7, 14, 1]], ["tile_rc", "sp", [-1, 64]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.979975e-05, 1.9807250000000003e-05, 1.98152e-05], 0, 4.116674423217773, 1650937833.3360083], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 87994222, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.28174e-05, 3.28324e-05, 3.283545e-05], 0, 0.8266921043395996, 1650939008.9283469], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [64, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 11665964, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.11318e-05, 2.11398e-05, 2.114625e-05], 0, 0.6964631080627441, 1650940826.7842171], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 29503014, "code_hash": null, "entity": [["tile_f", "sp", 
[-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[5.76747e-05, 5.769175e-05, 5.772775e-05], 0, 0.8780744075775146, 1650942896.389656], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [512, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 6593422, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.71372e-05, 1.71937e-05, 1.723925e-05], 0, 0.6540956497192383, 1650944631.3486328], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [128, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 10686988, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.57673e-05, 2.577485e-05, 2.578635e-05], 0, 1.1292264461517334, 1650946431.3661022], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 
"float16"], {}], "config": {"index": 474214, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 1, 49, 2]], ["tile_rc", "sp", [-1, 128]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}, "result": [[2.18753e-05, 2.187975e-05, 2.18833e-05], 0, 1.2164885997772217, 1650947759.1606152], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 8384098, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 4]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.11295e-05, 4.113955e-05, 4.1163549999999997e-05], 0, 1.0902330875396729, 1650949638.5774388], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 7353795, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[8.86406e-05, 8.864109999999999e-05, 8.864115e-05], 0, 0.8693947792053223, 1650950775.1563373], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [1024, 256, 1, 
1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 930783, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[2.34788e-05, 2.34858e-05, 2.35078e-05], 0, 0.5252759456634521, 1650951747.1516812], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [256, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1212775, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.91651e-05, 3.9180599999999995e-05, 3.9193099999999996e-05], 0, 0.6574358940124512, 1650953175.4410644], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 86722, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 8, 4, 4]], ["tile_x", "sp", [-1, 1, 49, 1]], ["tile_rc", "sp", [-1, 128]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.68669e-05, 2.6873399999999996e-05, 2.687835e-05], 0, 1.2940130233764648, 1650953832.3535905], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", 
[["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 4312500, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.6155250000000004e-05, 5.6155750000000005e-05, 5.616675e-05], 0, 0.949575662612915, 1650956448.7583783], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 42753, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[0.0001527741, 0.0001528176, 0.00015290065], 0, 0.6949319839477539, 1650957527.024363], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [2048, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 143805, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.205195e-05, 3.209495e-05, 3.21014e-05], 0, 0.7755589485168457, 1650958461.6669595], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [512, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 188833, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[6.173455e-05, 6.1736e-05, 6.174349999999999e-05], 0, 0.7472929954528809, 1650959393.5092852], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 45226, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 1, 4, 4]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[3.226995e-05, 3.22995e-05, 3.236145e-05], 0, 0.8135907649993896, 1650960185.1974587], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 789819, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.708445e-05, 8.710639999999999e-05, 8.711495e-05], 0, 2.0074827671051025, 1650961212.134662], "version": 
0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1, 2048], "float16"], ["TENSOR", [1000, 2048], "float16"], null, "float16"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[6.6981e-06, 6.7081e-06, 6.7141e-06], 0, 0.44542431831359863, 1650961789.8975906], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[7.8907e-05, 7.89105e-05, 7.89305e-05], 0, 1.3035199642181396, 1651696037.0387743], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [1024, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.2306149999999994e-05, 5.232314999999999e-05, 5.233665e-05], 0, 0.6558830738067627, 1651696051.5444815], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[3.39419e-05, 3.395845e-05, 3.4114899999999994e-05], 0, 0.6455562114715576, 1651696060.9264348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[7.451495e-05, 7.6711e-05, 8.551005e-05], 0, 1.2933030128479004, 1651696077.6541371], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[8.338005000000001e-05, 8.339205e-05, 8.345205e-05], 0, 1.2021043300628662, 1651696086.517543], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.517355e-05, 4.5203050000000004e-05, 4.52056e-05], 0, 0.6302511692047119, 1651696097.985193], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[6.15488e-05, 6.161125e-05, 6.16368e-05], 0, 1.2534654140472412, 1651696115.4964283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.3825550000000006e-05, 4.3842049999999995e-05, 4.38481e-05], 0, 0.6925802230834961, 1651696129.3560424], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.49448e-05, 2.494735e-05, 2.4948799999999997e-05], 0, 0.7423253059387207, 1651696138.326287], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.060915e-05, 5.0639650000000006e-05, 5.0771650000000004e-05], 0, 1.2802436351776123, 1651696155.4096394], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[6.28783e-05, 6.29178e-05, 6.48748e-05], 0, 1.224653720855713, 1651696166.4411144], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[7.627445e-05, 7.629045e-05, 7.63595e-05], 0, 1.2499911785125732, 1651696178.3922029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[5.29612e-05, 5.3004150000000006e-05, 5.5102199999999996e-05], 0, 1.3471219539642334, 1651696192.9950073], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.69028e-05, 2.6947850000000003e-05, 2.709185e-05], 0, 0.6759750843048096, 1651696206.3298318], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.0580999999999997e-05, 4.0632550000000005e-05, 4.0673e-05], 0, 1.2982831001281738, 1651696215.4789267], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.3493549999999996e-05, 4.352155e-05, 4.361855e-05], 0, 1.3339207172393799, 1651696233.0065742], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.64837e-05, 5.6511250000000005e-05, 6.553234999999999e-05], 0, 1.2724413871765137, 1651696246.2444258], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.016415e-05, 5.0178150000000003e-05, 5.02056e-05], 0, 0.6765668392181396, 1651696257.173029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.13285e-05, 4.135955e-05, 4.1496e-05], 0, 1.2668747901916504, 1651696265.374396], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.542705e-05, 4.566555e-05, 4.8272150000000005e-05], 0, 0.7064330577850342, 1651696278.1836157], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.19779e-05, 3.2020399999999996e-05, 3.22334e-05], 0, 1.2480614185333252, 1651696293.8619213], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 4, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.314195e-05, 3.315195e-05, 3.31679e-05], 0, 1.3792345523834229, 1651696312.5689838], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [128, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.605575e-05, 1.6090699999999998e-05, 1.61782e-05], 0, 1.2849547863006592, 1651696321.6512122], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 
-thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00013500355, 0.0001350666, 0.00013514510000000001], 0, 0.6779699325561523, 1651704762.118352], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.000130729, 0.000130799, 0.000130856], 0, 0.7103776931762695, 1651704764.6885316], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[4.252165e-05, 4.254115e-05, 4.254215e-05], 0, 0.5850663185119629, 1651704767.20349], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[1.93158e-05, 1.9405800000000003e-05, 1.9479299999999998e-05], 0, 0.5462775230407715, 1651704769.5398254], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": 
[[2.1815349999999998e-05, 2.1823350000000002e-05, 2.19333e-05], 0, 0.5473110675811768, 1651704771.9580688], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[9.769749999999999e-05, 9.77725e-05, 9.781149999999998e-05], 0, 0.6392307281494141, 1651704774.6099348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1280, 1, 1], "float32"], ["TENSOR", [1000, 1280, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[1.585375e-05, 1.585375e-05, 1.586625e-05], 0, 0.6553614139556885, 1651705312.2156374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.87343e-05, 1.8758849999999997e-05, 1.8765299999999998e-05], 0, 1.254763126373291, 1651705318.060327], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.7147599999999996e-05, 3.71591e-05, 3.71691e-05], 0, 
1.297356128692627, 1651705334.5834737], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.79461e-05, 3.79671e-05, 3.796715e-05], 0, 1.2885222434997559, 1651705348.3673804], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.5520249999999998e-05, 1.5562750000000003e-05, 1.644575e-05], 0, 1.3106482028961182, 1651705361.6600392], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.5274899999999997e-05, 2.5275900000000003e-05, 2.530395e-05], 0, 1.29083251953125, 1651705367.3725972], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.6715450000000002e-05, 2.6721899999999996e-05, 
2.6767950000000002e-05], 0, 1.2988712787628174, 1651705383.1567316], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.613175e-05, 1.6169249999999998e-05, 1.621125e-05], 0, 1.295907974243164, 1651705394.4066052], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.90103e-05, 1.90198e-05, 1.90208e-05], 0, 1.2901599407196045, 1651705407.7473567], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.90728e-05, 1.90858e-05, 1.909985e-05], 0, 1.2148852348327637, 1651705422.5198638], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 28, 28], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.539325e-05, 1.539975e-05, 1.5983249999999996e-05], 0, 
1.2038941383361816, 1651705431.8003838], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.5386250000000002e-05, 1.540225e-05, 1.541325e-05], 0, 1.298431634902954, 1651705446.8170843], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.58127e-05, 1.6144749999999998e-05, 1.63108e-05], 0, 1.2637646198272705, 1651705455.472888], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.4976249999999999e-05, 1.7667749999999997e-05, 1.774875e-05], 0, 1.1796517372131348, 1651705468.7749956], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.532325e-05, 1.5424749999999998e-05, 1.567525e-05], 0, 
1.286487340927124, 1651705487.1521738], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.51377e-05, 1.5184749999999999e-05, 1.5248749999999999e-05], 0, 1.1545443534851074, 1651705491.9636042], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.583025e-05, 1.583175e-05, 1.641125e-05], 0, 1.2789053916931152, 1651705511.8756096], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.596475e-05, 1.63408e-05, 1.766875e-05], 0, 1.2472562789916992, 1651705522.051849], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.4875400000000002e-05, 2.4914400000000003e-05, 2.49284e-05], 0, 1.2587049007415771, 
1651705532.8126292], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.580675e-05, 1.5856300000000003e-05, 1.5899250000000002e-05], 0, 1.2889397144317627, 1651705545.7501018], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.533775e-05, 1.536725e-05, 1.536775e-05], 0, 1.2521677017211914, 1651705557.5611386], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.612825e-05, 1.6157300000000002e-05, 1.6178749999999998e-05], 0, 1.2984960079193115, 1651705570.6978276], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.8641849999999995e-05, 5.8685849999999996e-05, 5.9010850000000004e-05], 0, 
1.2714319229125977, 1651713264.5110943], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [1024, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.15054e-05, 6.163035e-05, 6.22079e-05], 0, 1.270298957824707, 1651713270.2057958], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.843080000000001e-05, 5.888885e-05, 5.890435000000001e-05], 0, 1.3643712997436523, 1651713286.55753], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.20044e-05, 6.215889999999999e-05, 6.27134e-05], 0, 1.3452882766723633, 1651713299.5873055], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.113384999999999e-05, 6.12164e-05, 6.12939e-05], 0, 
1.3625106811523438, 1651713310.588895], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[7.63861e-05, 7.63931e-05, 7.77211e-05], 0, 1.4055383205413818, 1651713325.2641764], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.80418e-05, 5.821035e-05, 5.842684999999999e-05], 0, 1.346651315689087, 1651713331.084954], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [512, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.944685e-05, 5.953585e-05, 5.9695400000000004e-05], 0, 1.2687809467315674, 1651713342.4434164], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.35162e-05, 6.37062e-05, 6.374014999999999e-05], 0, 1.2827024459838867, 
1651713358.4088492], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.186515e-05, 6.191315e-05, 6.222864999999999e-05], 0, 1.2956676483154297, 1651713367.8496935], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0032100000000004e-05, 6.0201099999999995e-05, 6.102315e-05], 0, 1.367894172668457, 1651713384.8863037], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.615235e-05, 4.6155350000000005e-05, 4.67844e-05], 0, 0.6502845287322998, 1651713389.0626698], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.905665e-05, 5.910555e-05, 5.9806099999999996e-05], 0, 1.3620550632476807, 
1651713401.7820523], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [256, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.297215e-05, 6.298165e-05, 6.303215e-05], 0, 1.2902483940124512, 1651713416.3568978], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.8349199999999995e-05, 3.8372200000000005e-05, 3.83862e-05], 0, 1.2480621337890625, 1651713429.365083], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.86542e-05, 3.866125e-05, 3.8687700000000006e-05], 0, 1.2490315437316895, 1651713438.7554371], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.891705e-05, 5.90376e-05, 5.916055e-05], 0, 1.2681810855865479, 1651713447.8981256], 
"version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.140065e-05, 6.14486e-05, 6.149865e-05], 0, 1.307438611984253, 1651713464.5051072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [128, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.749815e-05, 3.7540649999999994e-05, 3.75712e-05], 0, 1.3630273342132568, 1651713472.181648], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [256, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.620115e-05, 3.6212149999999995e-05, 3.628415e-05], 0, 1.2850840091705322, 1651713485.997229], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [256, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.2912399999999998e-05, 2.29144e-05, 2.2933899999999997e-05], 0, 1.3092646598815918, 1651713499.214019], "version": 0.2, 
"tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.240745e-05, 5.2427950000000003e-05, 5.24484e-05], 0, 1.3632240295410156, 1651713513.548847], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [128, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.67913e-05, 1.68143e-05, 1.68283e-05], 0, 1.330857515335083, 1651713525.1422098], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.0006445004499999999, 0.0006610111999999999, 0.0006667967499999999], 0, 0.648043155670166, 1651716140.0694156], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00062875765, 0.00063329275, 0.0006615721500000001], 0, 0.6620566844940186, 1651716142.9851758], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", 
"dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00016988825000000001, 0.00016991475, 0.00017001125], 0, 0.553156852722168, 1651716145.929202], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 32], "float32"], ["TENSOR", [600, 64, 32], "float32"], [600, 32, 64], "float32", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[8.121880000000001e-05, 8.12663e-05, 8.133385e-05], 0, 0.5643446445465088, 1651716148.5040443], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 64], "float32"], ["TENSOR", [600, 32, 64], "float32"], [600, 32, 32], "float32", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00012531600000000002, 0.0001253595, 0.00012557855000000002], 0, 0.5810286998748779, 1651716151.1036334], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00046191695, 0.00046221050000000007, 0.00047105154999999997], 0, 0.6363368034362793, 1651716154.1708283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1280, 1, 1], "float16"], ["TENSOR", [1000, 1280, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, 
"code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.02168e-05, 5.048629999999999e-05, 5.07603e-05], 0, 1.243542194366455, 1651717284.5080416], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 320, 7, 7], "float16"], ["TENSOR", [1280, 320, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.0327099999999995e-05, 4.033165e-05, 4.037015e-05], 0, 1.2465286254882812, 1651717295.9882414], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [320, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.149594999999999e-05, 6.213245e-05, 6.32615e-05], 0, 1.318899154663086, 1651717309.334955], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [160, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.716590000000001e-05, 5.74579e-05, 5.75394e-05], 0, 1.2854020595550537, 1651717317.5957215], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 160, 7, 7], "float16"], ["TENSOR", [960, 160, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": 
[["algo", "ot", 0]]}, "result": [[2.303085e-05, 2.303885e-05, 2.3046849999999998e-05], 0, 1.2467610836029053, 1651717327.262812], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 7, 7], "float16"], ["TENSOR", [160, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0953449999999995e-05, 6.199295e-05, 6.449e-05], 0, 1.27321195602417, 1651717339.923953], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [96, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.847925e-05, 4.8582249999999994e-05, 4.8923249999999996e-05], 0, 0.7204091548919678, 1651717351.960737], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 14, 14], "float16"], ["TENSOR", [576, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.88083e-05, 1.883275e-05, 1.884675e-05], 0, 1.296351432800293, 1651717366.7528808], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 14, 14], "float16"], ["TENSOR", [96, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, 
"result": [[4.69267e-05, 4.69492e-05, 4.775025e-05], 0, 0.6626412868499756, 1651717373.8019524], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [64, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.43037e-05, 4.433015e-05, 4.5084200000000005e-05], 0, 0.7547762393951416, 1651717383.6092317], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 28, 28], "float16"], ["TENSOR", [384, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.609725e-05, 1.614725e-05, 1.641175e-05], 0, 1.2399446964263916, 1651717399.7526286], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [64, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.708045e-05, 2.70849e-05, 2.7090900000000002e-05], 0, 1.2248289585113525, 1651717410.9461355], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [32, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.6925900000000006e-05, 
2.6931899999999995e-05, 2.706495e-05], 0, 1.2500784397125244, 1651717417.8287508], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 28, 28], "float16"], ["TENSOR", [192, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.570075e-05, 1.5813749999999997e-05, 1.586775e-05], 0, 1.197462558746338, 1651717429.7097864], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 28, 28], "float16"], ["TENSOR", [32, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.344685e-05, 2.34489e-05, 2.347385e-05], 0, 1.221846342086792, 1651717441.3164573], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [24, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.3502899999999998e-05, 2.350685e-05, 2.3518900000000003e-05], 0, 1.2411229610443115, 1651717456.703392], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 24, 56, 56], "float16"], ["TENSOR", [144, 24, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.656425e-05, 1.65673e-05, 
1.67688e-05], 0, 1.2433347702026367, 1651717463.5552585], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 56, 56], "float16"], ["TENSOR", [24, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.69633e-05, 1.6982800000000003e-05, 1.69848e-05], 0, 1.2298755645751953, 1651717475.358641], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 16, 112, 112], "float16"], ["TENSOR", [96, 16, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.81473e-05, 1.818825e-05, 1.8337800000000003e-05], 0, 1.2717070579528809, 1651717489.3928924], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [16, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.660675e-05, 1.6634750000000002e-05, 1.6637750000000002e-05], 0, 1.2657883167266846, 1651717502.1012578], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.70378e-05, 1.70493e-05, 1.75908e-05], 0, 
1.2577564716339111, 1651717513.2569478], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [32, 3, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.683475e-05, 1.6837750000000003e-05, 1.6855800000000002e-05], 0, 1.2461504936218262, 1651717520.0673897], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1, 2048], "float16"], ["TENSOR", [1000, 2048], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[1.012365e-05, 1.013265e-05, 1.017665e-05], 0, 0.511929988861084, 1651718651.679756], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.54195e-05, 6.549299999999999e-05, 6.550705e-05], 0, 1.395721435546875, 1651718658.7949653], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [512, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.2396e-05, 6.26255e-05, 6.280845e-05], 0, 1.296645164489746, 1651718672.3619213], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [2048, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.71654e-05, 5.731935e-05, 5.7772900000000004e-05], 0, 1.2215275764465332, 1651718687.327885], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.98051e-05, 6.982555e-05, 6.986205e-05], 0, 1.2769300937652588, 1651718697.2846546], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.818955e-05, 6.820205e-05, 6.87891e-05], 0, 1.224205493927002, 1651718706.8167992], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [256, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.9839299999999996e-05, 5.013474999999999e-05, 5.0225299999999995e-05], 0, 0.675896167755127, 1651718721.3677247], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda 
-keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [1024, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.600905e-05, 3.601655e-05, 3.60421e-05], 0, 1.211909532546997, 1651718733.218238], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.584605e-05, 6.602705e-05, 6.646905e-05], 0, 1.2575101852416992, 1651718744.2928133], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.469235e-05, 5.492235e-05, 5.5110899999999994e-05], 0, 1.3174407482147217, 1651718761.5744207], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [128, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.54077e-05, 4.565825e-05, 4.6260699999999996e-05], 0, 0.6943933963775635, 1651718771.8819072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [512, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.460135e-05, 2.4602899999999998e-05, 2.461885e-05], 0, 1.2023015022277832, 1651718779.6079652], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0838449999999993e-05, 6.0851449999999995e-05, 6.165745e-05], 0, 1.2409472465515137, 1651718790.9801152], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [64, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.40235e-05, 3.405705e-05, 3.4086e-05], 0, 1.2000832557678223, 1651718801.1824563], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.791055e-05, 6.794504999999999e-05, 6.79531e-05], 0, 1.2569653987884521, 1651718819.732785], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.638225e-05, 1.63893e-05, 1.671225e-05], 0, 1.1855523586273193, 1651718824.89687], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [64, 3, 7, 7], "float16"], [2, 2], [3, 3, 3, 3], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.066025e-05, 5.0691799999999995e-05, 5.092584999999999e-05], 0, 1.2404775619506836, 1651718841.5797617], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[2.261185e-05, 2.2639350000000002e-05, 2.264785e-05], 0, 0.5434033870697021, 1651719634.0237954], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[0.0001233614, 0.00012337235, 0.0001235209], 0, 1.3691678047180176, 1651719642.5484385], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], 
"float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.8770900000000003e-05, 5.880735e-05, 5.896090000000001e-05], 0, 0.7814383506774902, 1651719660.6665301], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.3442850000000002e-05, 2.348735e-05, 2.349685e-05], 0, 0.7805249691009521, 1651719670.6545522], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[0.0001233889, 0.0001234364, 0.00012356584999999998], 0, 1.3443200588226318, 1651719689.3742628], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[6.372395e-05, 6.374645e-05, 6.375595e-05], 0, 1.365302562713623, 1651719706.674646], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], 
["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.4549200000000004e-05, 4.45562e-05, 4.457515e-05], 0, 1.327195644378662, 1651719715.0719266], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.234335e-05, 2.23598e-05, 2.237235e-05], 0, 0.6870465278625488, 1651719729.4540246], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.869755e-05, 6.872955e-05, 6.886254999999999e-05], 0, 1.3498952388763428, 1651719742.1431673], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[3.7592099999999995e-05, 3.768655e-05, 3.935910000000001e-05], 0, 1.39213228225708, 1651719760.1923416], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 
512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.28385e-05, 3.28675e-05, 3.2889e-05], 0, 1.3503003120422363, 1651719778.4827368], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.4456849999999996e-05, 2.4474399999999997e-05, 2.4484349999999995e-05], 0, 1.3296136856079102, 1651719786.8804219], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[7.127059999999999e-05, 7.132754999999999e-05, 7.13411e-05], 0, 1.249570608139038, 1651719801.3369465], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.11993e-05, 2.130335e-05, 2.186885e-05], 0, 0.655217170715332, 1651719817.2530777], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], 
"float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[3.667505e-05, 3.6721099999999996e-05, 3.738805e-05], 0, 1.327420711517334, 1651719833.0931804], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.589875e-05, 1.591525e-05, 1.617775e-05], 0, 1.33211088180542, 1651719846.9002128], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.56097e-05, 4.56587e-05, 4.57937e-05], 0, 1.3442072868347168, 1651719855.7311447], "version": 0.2, "tvm_version": "0.9.dev0"} diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index f73f2230df4d7..eebb2fb77c201 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -544,6 +544,24 @@ constexpr const char* kExternalMods = "external_mods"; */ constexpr const char* kConstNameToConstant = "const_name_to_constant"; +/*! + * \brief All the runtime::Modules accumulated during compilation by external codegen. These + * modules must be either directly linked or captured in the final compilation artifact. + * + * Type: Array + */ +constexpr const char* kExternalMods = "external_mods"; + +/*! + * \brief All the named runtime::NDArrays accumulated during compilation by external codegen. 
+ * Generally the associated runtime::Module will indicate it requires bindings for these names, + * and during module initialization these bindings will be recovered from a ConstLoaderModule. + * See also kConstantsArray above, which is the analog for PrimFuncs. + * + * Type: Map + */ +constexpr const char* kConstNameToNDArray = "const_name_to_ndarray"; + } // namespace attr } // namespace tvm #endif // TVM_IR_MODULE_H_ diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 11a608d4cbbf8..ee645976ec90c 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -58,6 +58,11 @@ class DispatchContext(object): def __init__(self): self._old_ctx = DispatchContext.current + # TODO(mbs): Collage only: Allow cache query + # DO NOT SUBMIT + def contains(self, target, workload): + raise NotImplementedError() + def query(self, target, workload): """ Query the context to get the specific config for a template. @@ -297,8 +302,10 @@ def load(self, records): counter = 0 for inp, res in joint_records: counter += 1 - if res.error_no != 0: - continue + # TODO(mbs): Collage only: Cache the error so don't re-tune + # DO NOT SUBMIT + # if res.error_no != 0: + # continue # use target keys in tvm target system as key to build best map for k in inp.target.keys: @@ -320,7 +327,16 @@ def load(self, records): if np.mean(other_res.costs) > np.mean(res.costs): best_by_model[key] = (inp, res) - logger.debug("Finish loading %d records", counter) + # TODO(mbs): Collage only: Too verbose + # DO NOT SUBMIT + # logger.info("Finished loading %d records", counter) + + # TODO(mbs): Collage only: Allow cache query + # DO NOT SUBMIT + def contains(self, target, workload): + # logger.info( + # f"look for match with {target} and {workload} with {len(self._best_user_defined)} user-defined, {len(self.best_by_model)} model and {len(self.best_by_targetkey)} target entries") + return self._query_inside(target, workload) is not None 
def _query_inside(self, target, workload): if target is None: diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py index 3ba007d9a4d37..a62ad0ae3eec1 100644 --- a/python/tvm/meta_schedule/testing/custom_builder_runner.py +++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py @@ -84,11 +84,8 @@ def build_relay_with_tensorrt( from tvm.relay.op.contrib import tensorrt from tvm.runtime import Module - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with PassContext( - opt_level=3, - config={"relay.ext.tensorrt.options": config}, - ): + mod = tensorrt.partition_for_tensorrt(mod, params) + with PassContext(opt_level=3): result = relay_build(mod, target=target, target_host=None, params=params) assert isinstance(result, Module) return result diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 89c8fcb17d731..97842738e5cd4 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -32,6 +32,7 @@ from . import transform from . import analysis +from . import collage from .build_module import build, create_executor, optimize from .transform import build_config from . import debug diff --git a/python/tvm/relay/collage/__init__.py b/python/tvm/relay/collage/__init__.py new file mode 100644 index 0000000000000..bb77f69a7c2cb --- /dev/null +++ b/python/tvm/relay/collage/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +from .collage_partitioner import * diff --git a/python/tvm/relay/collage/_ffi_api.py b/python/tvm/relay/collage/_ffi_api.py new file mode 100644 index 0000000000000..afaa5ce98df10 --- /dev/null +++ b/python/tvm/relay/collage/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for the Collage partitioner.""" +import tvm._ffi + + +tvm._ffi._init_api("collage", __name__) diff --git a/python/tvm/relay/collage/collage_partitioner.py b/python/tvm/relay/collage/collage_partitioner.py new file mode 100644 index 0000000000000..88a1b1da8fe2f --- /dev/null +++ b/python/tvm/relay/collage/collage_partitioner.py @@ -0,0 +1,237 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Search for optimal partitionings over Relay models.""" + +import tvm +import numpy as np +from tvm._ffi.registry import register_func, register_object +from tvm.runtime import Object +import logging +import os +import shutil +import math +import tempfile + +from . import _ffi_api + +AUTOTVM_NUM_TRIALS = 2000 +AUTOTVM_EARLY_STOPPING = 600 +MEASURE_NUMBER = 20 +MEASURE_REPEAT = 5 +WARMUP_MIN_REPEAT_MS = 250 +TIMEOUT = 10 + + +@register_object("collage.CostEstimator") +class CostEstimator(Object): + """CostEstimator class""" + + def __init__(self): + self.__init_handle_by_constructor__(_ffi_api.CostEstimator) + + +@register_object("collage.MockEstimator") +class MockEstimator(Object): + """MockEstimator class""" + + def __init__(self, target_costs): + self.__init_handle_by_constructor__(_ffi_api.MockEstimator, target_costs) + + +def arg_for(type, device): + """Returns a test argument of type on device""" + assert isinstance(type, tvm.ir.TensorType) + return tvm.nd.array( + np.random.uniform(-1.0, 1.0, size=type.concrete_shape).astype(type.dtype), device=device + ) + + +def is_already_tuned(task, log_filename): + """Returns true if we already have a tuning record for task in turning logs in log_filename""" + if not os.path.exists(log_filename): + return False + + dispatch_context = 
tvm.autotvm.task.ApplyHistoryBest(log_filename) + return dispatch_context.contains(task.target, task.workload) + + +def extract_autotvm_tasks(mod, target): + return tvm.autotvm.task.extract_from_program(mod, target=target, params=None) + + +def optional_tuning_records(log_filename): + if log_filename == "" or not os.path.exists(log_filename): + return tvm.autotvm.task.FallbackContext() + else: + return tvm.autotvm.task.ApplyHistoryBest(log_filename) + + +def tune_autotvm_tasks(tasks, log_filename): + """Appends to log_filename the best strategies for tasks""" + if len(tasks) == 0: + return + + measure_option = tvm.autotvm.measure_option( + builder=tvm.autotvm.LocalBuilder(timeout=TIMEOUT), + runner=tvm.autotvm.LocalRunner( + number=MEASURE_NUMBER, repeat=MEASURE_REPEAT, timeout=TIMEOUT, min_repeat_ms=0 + ), + ) + + logging.info( + f"Using autotvm tuning for {len(tasks)} tasks with {AUTOTVM_NUM_TRIALS} trials, logging to {log_filename}" + ) + + # create tmp log file, starting with contents from existing log file + tmp_log_filename = log_filename + ".tmp" + if os.path.exists(tmp_log_filename): + os.remove(tmp_log_filename) + if os.path.exists(log_filename): + logging.info(f"Copying existing log {log_filename} to {tmp_log_filename}") + shutil.copy(log_filename, tmp_log_filename) + + for i, task in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + logging.info(f"Considering task {task.name} {prefix}") + if is_already_tuned(task, tmp_log_filename): + logging.info(f"Re-using existing record for {task.name}") + continue + + logging.info(f"Using autotvm to tune {task.name}") + tuner_obj = tvm.autotvm.tuner.XGBTuner(task, loss_type="rank") + if os.path.exists(tmp_log_filename): + tuner_obj.load_history(tvm.autotvm.record.load_from_file(tmp_log_filename)) + + # do tuning + n_trial = min(AUTOTVM_NUM_TRIALS, len(task.config_space)) + tuner_obj.tune( + n_trial=n_trial, + early_stopping=AUTOTVM_EARLY_STOPPING, + measure_option=measure_option, + 
callbacks=[ + tvm.autotvm.callback.progress_bar(n_trial, prefix=prefix), + tvm.autotvm.callback.log_to_file(tmp_log_filename), + ], + ) + + # pick best records and copy back to main log file + tvm.autotvm.record.pick_best(tmp_log_filename, log_filename) + os.remove(tmp_log_filename) + + logging.info("Done with autotvm tuning") + + +def vm_estimate_seconds(device, vm, func_name, args): + # Warmup + vm.benchmark( + device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, func_name=func_name, **args + ) + # For realz this time + return vm.benchmark( + device, + repeat=MEASURE_REPEAT, + number=MEASURE_NUMBER, + min_repeat_ms=0, + func_name=func_name, + **args, + ) + + +@register_func("tvm.relay.collage.estimate_seconds") +def estimate_seconds(mod, target, needs_tvm_tuning): + """Returns the mean execution time of "main" in mod on target with params. The module + may contain "Primitive" functions, possibly with "Compiler" attributes.""" + device = tvm.device(target.kind.device_type) + + try: + # Build the module. + logging.info("Compiling module to estimate") + exe = tvm.relay.vm.compile(mod, target) + except RuntimeError as e: + # A build failure indicates the partition is not supported. + # eg trying to build an nn.batch_norm on GPU, which has no schedule since we assume it + # is only ever used with a tuple projection which is rewritten away. + logging.info(f"Assigning module infinite cost since unable to build: {e}") + return math.inf + + # Finalize compilation + tmp_dir = tempfile.mkdtemp() + code, lib = exe.save() + lib_path = os.path.join(tmp_dir, "library.so") + # TODO(mbs): Avoid nvcc dependency? + lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc") + lib = tvm.runtime.load_module(lib_path) + exe = tvm.runtime.vm.Executable.load_exec(code, lib) + + # Benchmark the module. 
+ vm = tvm.runtime.vm.VirtualMachine(exe, device) + func_name = "main" + main_args = {v.name_hint: arg_for(v.checked_type, device) for v in mod[func_name].params} + logging.info("Benchmarking module to estimate") + profile = vm_estimate_seconds(device, vm, func_name, main_args) + logging.info(f"profile: {profile}") + return profile.median # seconds + + +make_labelled_dfpattern_partition_rule = tvm._ffi.get_global_func( + "relay.collage.make_labelled_dfpattern_partition_rule" +) +make_labelled_dfpattern_partition_rule_with_predicate = tvm._ffi.get_global_func( + "relay.collage.make_labelled_dfpattern_partition_rule_with_predicate" +) +make_pattern_byoc_partition_rule = tvm._ffi.get_global_func( + "relay.collage.make_pattern_byoc_partition_rule" +) + + +def make_labelled_dfpattern_partition_rule_wrapper(compiler, tuple): + if len(tuple) == 2: + rule_name, dataflow_pattern = tuple + return make_labelled_dfpattern_partition_rule(compiler, rule_name, dataflow_pattern) + else: + rule_name, dataflow_pattern, predicate = tuple + return make_labelled_dfpattern_partition_rule_with_predicate( + compiler, rule_name, dataflow_pattern, predicate + ) + + +@register_func("tvm.relay.collage.make_byoc_partition_rule") +def make_byoc_partition_rule(compiler): + """Returns the PartitionRule for BYOC compiler""" + pattern_table = tvm.relay.op.contrib.get_pattern_table(compiler) + assert ( + pattern_table is not None + ), f"No pattern table entry was found for BYOC compiler {compiler}" + logging.info( + f"Converting {len(pattern_table)} rules for {compiler} for use in pattern style BYOC lowering/codegen" + ) + sub_rules = [ + make_labelled_dfpattern_partition_rule_wrapper(compiler, tuple) for tuple in pattern_table + ] + return make_pattern_byoc_partition_rule(compiler, sub_rules) + + +def autotvm_tune_module(mod, target, log_filename): + if log_filename == "": + logging.info("Not tuning with autotvm since disabled") + return + # Extract and tune any TVM kernels. 
BYOC partitions will have no tasks extracted. + logging.info("Extracting tasks from overall module") + tasks = extract_autotvm_tasks(mod, target) + logging.info(f"Auto-tuning {len(tasks)} tasks from overall module") + tune_autotvm_tasks(tasks, log_filename) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index a69e2d4105290..c441c30808c3f 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -26,13 +26,17 @@ from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name from tvm.relay.dataflow_pattern import is_op, wildcard, is_constant, is_tuple, is_tuple_get_item -from tvm.relay.expr import Call, Constant, TupleGetItem +from tvm.relay.expr import Call, Constant, GlobalVar, TupleGetItem from tvm.relay.expr_functor import ExprMutator, ExprVisitor from tvm.relay.op.contrib.register import register_pattern_table logger = logging.getLogger("TensorRT") +def is_tensorrt_compiler_enabled() -> bool: + return tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) is not None + + def is_tensorrt_runtime_enabled() -> bool: """Check if the TensorRT graph executor is present. Returns @@ -40,116 +44,90 @@ def is_tensorrt_runtime_enabled() -> bool: ret: bool True if present, False if not. """ - check_enabled = tvm.get_global_func("relay.op.is_tensorrt_runtime_enabled", True) + check_enabled = tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) if check_enabled: return check_enabled() return False +def get_tensorrt_target() -> tvm.target.Target: + """Returns the current Target, which must be of kind "tensorrt".""" + target = tvm.target.Target.current() + assert target.kind.name == "tensorrt" + return target + + def get_tensorrt_version() -> Tuple[int, int, int]: - """Gets the version of TensorRT that TVM is built against or is targeting. + """Returns the version of TensorRT to assume during compilation. 
+ In order of preference this is taken from: + - The current "tensorrt" target's "tensorrt_version" attribute string. + - The version linked to the TVM runtime. + - (6, 0, 1) Returns ------- ret: Tuple[int, int, int] - TensorRT version as a tuple of major, minor, and patch number. If TVM - is not built with TensorRT, the value set by set_tensorrt_version() is returned instead. + TensorRT version as a tuple of (major, minor, patch). """ - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return tuple(pass_ctx.config["relay.ext.tensorrt.options"].tensorrt_version) # type: ignore - return tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) # type: ignore + target = get_tensorrt_target() + version = target.attrs["tensorrt_version"] + if len(version) == 3: + return int(version[0]), int(version[1]), int(version[2]) + assert len(version) == 0 + + get_version = tvm.get_global_func("relay.ext.tensorrt.get_version", True) + if get_version: + version = get_version() + assert len(version) == 3 + return int(version[0]), int(version[1]), int(version[2]) - -def get_tensorrt_use_implicit_batch_mode() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].use_implicit_batch logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "use_implicit_batch=True." + "TVM was not built against TensorRT and no version was provided to " + "partition_for_tensorrt. 
Defaulting to 6.0.1" ) - return True + return (6, 0, 1) + + +def get_tensorrt_use_implicit_batch_mode() -> bool: + """Returns the "use_implicit_batch" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_implicit_batch"] def get_tensorrt_remove_no_mac_subgraphs() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].remove_no_mac_subgraphs - logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "remove_no_mac_subgraphs=False." - ) - return False + """Returns the "remove_no_mac_subgraphs" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["remove_no_mac_subgraphs"] + + +def get_tensorrt_use_fp16() -> bool: + """Returns the "use_fp16" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_fp16"] def partition_for_tensorrt( mod: tvm.IRModule, params: Optional[Dict[str, tvm.nd.NDArray]] = None, - version: Optional[Tuple[int, int, int]] = None, - use_implicit_batch: bool = True, - remove_no_mac_subgraphs: bool = False, - max_workspace_size: int = 1 << 30, - use_fp16: bool = False, - use_uint8: bool = False, -) -> Tuple[tvm.IRModule, Dict[str, Any]]: - """Partition the graph greedily offloading supported operators to TensorRT. + target: tvm.target.Target = tvm.target.Target("tensorrt"), +) -> tvm.IRModule: + """Partition all functions in mod to greedily offload supported operators to TensorRT. Parameters ---------- mod : tvm.IRModule - The module to run passes on. + The module to partition. + target : tvm.target.Target + A target of kind "tensorrt" describing additional partitioning and compilation options. params : Optional[Dict[str, tvm.nd.NDArray]] Constant input parameters. 
- version : Optional[Tuple[int, int, int]] - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled with - USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. - use_implicit_batch : bool - Use TensorRT implicit batch mode (default true). Setting to false will enable explicit batch - mode which will widen supported operators to include those which modify the batch dimension, - but may reduce performance for some models. - remove_no_mac_subgraphs : bool - Removes subgraphs which have been partitioned for TensorRT if they do not have any - multiply-accumulate operations. The removed subgraphs will go through TVM's standard - compilation instead. Can improve performance. - max_workspace_size : int - How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. - See TensorRT documentation for more info. - use_fp16: bool - Allows, TRT to automatically convert FP32 inputs to FP16. Also, it is required to be enabled - if FP16 inputs tensors and weights are used. - Note that TensorRT will still choose a higher-precision kernel if it results in overall - lower runtime, or if no low-precision implementation exists. - use_uint8: bool - Allows, TRT to automatically convert FP32 inputs to UINT8. Returns ------- - mod_and_config : Tuple[tvm.IRModule, Dict[str, Any]] - A tuple of 1) annotated and partitioned module and 2) "relay.ext.tensorrt.options" - configuration which should be given to PassContext when building. + partitioned_mod : tvm.IRModule + The partitioned module. 
""" - config: Dict[str, Any] = { - "use_implicit_batch": use_implicit_batch, - "max_workspace_size": max_workspace_size, - "remove_no_mac_subgraphs": remove_no_mac_subgraphs, - "use_fp16": use_fp16, - "use_uint8": use_uint8, - } - if version: - assert isinstance(version, tuple) and len(version) == 3 - config["tensorrt_version"] = version - else: - linked_version = tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) - if not linked_version: - logger.warning( - "TVM was not built against TensorRT and no version was provided to " - "partition_for_tensorrt. Defaulting to 6.0.1" - ) - linked_version = (6, 0, 1) - config["tensorrt_version"] = linked_version - if params: mod["main"] = bind_params_by_name(mod["main"], params) @@ -174,24 +152,27 @@ def partition_for_tensorrt( transform.InferType(), ] ) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with target: mod = seq(mod) - # TODO(mbs): Revisit - # mod = prune_tensorrt_subgraphs(mod) - return mod, config + mod = prune_tensorrt_subgraphs(mod) + return mod def is_supported_trt_type(typ: Union[tvm.ir.TensorType, tvm.ir.TupleType], op_name: str) -> bool: """Check whether a type is supported by TensorRT.""" - supported_dtypes = ["float32", "float16"] + supported_dtypes = ["float32"] + if get_tensorrt_use_fp16(): + supported_dtypes.append("float16") if isinstance(typ, tvm.ir.TensorType): if typ.dtype not in supported_dtypes: - logger.info(f"{op_name}: Only float32 and float16 tensor dtypes are supported.") + logger.info(f"{op_name}: Only {supported_dtypes} tensor dtypes are supported.") return False - # assumes dim 0 is for batch and can be dynamic - # TODO(mbs): But does this depend use_implicit_batch flag? - for dim_shape in typ.shape[1:]: - if isinstance(dim_shape, tvm.tir.expr.Any): + dims = typ.shape + if get_tensorrt_use_implicit_batch_mode(): + # The first dimension can be Any. 
+ dims = dims[1:] + for dim in dims: + if isinstance(dim, tvm.tir.expr.Any): logger.info(f"{op_name}: Only statically known tensor shapes are supported.") return False elif isinstance(typ, tvm.ir.TupleType): @@ -247,7 +228,10 @@ def predicate(expr: relay.expr.Expr) -> bool: args = get_args(expr) if not all([is_supported_trt_type(arg.checked_type, op_name) for arg in args]): return False - return checker(attrs, args, op_name) + if not checker(attrs, args, op_name): + return False + logger.info(f"{op_name}: Predicate passes") + return True return predicate @@ -535,11 +519,16 @@ def concatenate_checker( if int(attrs.axis) == 0: logger.info(f"{op_name}: can't modify batch dimension.") return False - if isinstance(args[0], relay.Tuple): - for tuple_input in args[0].fields: - if isinstance(tuple_input, Constant): - logger.info(f"{op_name}: can't concatenate tensors with constants.") - return False + + if not isinstance(args[0], relay.Tuple): + logger.info("f{op_name}: concatenate must be applied to a literal tuple") + return False + + for tuple_input in args[0].fields: + if isinstance(tuple_input, Constant): + logger.info(f"{op_name}: can't concatenate tensors with constants.") + return False + return True diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index d7979a757171b..fa2c2ceeb5ee2 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -1461,3 +1461,45 @@ def InlineCompilerFunctionsBoundTo(global_vars): The pass. """ return _ffi_api.InlineCompilerFunctionsBoundTo(global_vars) + + +def CaptureIndexInSpans(): + """Captures the post-dfs index and dominator post-dfs index of (most) expression nodes in + their span, in the form "index::". + + This is useful for debugging since a) it helps identify pretty-printed sub-expressions within + the overall model and b) the indexes are heavily used by Collage for its compact representation + of sub-graphs. 
+ + Note that Op and Constructor nodes are not changed even though they are assigned an + post-dfs index. + + Returns + ------- + ret : tvm.transform.Pass + The pass. + """ + return _ffi_api.CaptureIndexInSpans() + + +def CollagePartition(config, cost_estimator=None): + """Partition the bodies of all functions according to the available targets so as to + minimize model latency. See https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md. + + Parameters + ---------- + config : CompilationConfig + The available targets. + cost_estimator : CostEstimator, optional + The custom cost estimator to use for costing each candidate partition. + + Returns + ------- + ret : tvm.transform.Pass + The pass. + + """ + if cost_estimator is None: + cost_estimator = relay.collage.CostEstimator() + + return _ffi_api.CollagePartition(config, cost_estimator) diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index e08cd240d4d1e..ec1887cee9097 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -33,42 +33,56 @@ #include "../codegen_json/codegen_json.h" #if TVM_GRAPH_EXECUTOR_TENSORRT +#include "../../../transforms/compiler_function_utils.h" #include "NvInfer.h" #endif namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { -/*! \brief Attributes to store the compiler options for TensorRT. 
*/ -struct TensorRTCompilerConfigNode : public tvm::AttrsNode { - Array tensorrt_version; - bool use_implicit_batch; - size_t max_workspace_size; - bool remove_no_mac_subgraphs; - bool use_fp16; - bool use_uint8; - - TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { - TVM_ATTR_FIELD(tensorrt_version) - .describe("TensorRT version as (major, minor, patch).") - .set_default(Array({6, 0, 1})); - TVM_ATTR_FIELD(use_implicit_batch).set_default(true); - TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); - TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); - TVM_ATTR_FIELD(use_fp16).set_default(false); - TVM_ATTR_FIELD(use_uint8).set_default(false); - } -}; +/*! + * \brief Check whether TensorRT graph executor is enabled. + * \return True if enabled, False if not. + */ +inline constexpr bool IsRuntimeEnabled() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return true; +#else + return false; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} -class TensorRTCompilerConfig : public Attrs { - public: - TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, - TensorRTCompilerConfigNode); -}; +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.is_runtime_enabled").set_body_typed(IsRuntimeEnabled); -TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); -TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); +/*! + * \brief Get TensorRT version that TVM is built against. + * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph + * runtime is not enabled. + */ +Array GetVersion() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; +#else + return {}; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} + +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.get_version").set_body_typed(GetVersion); + +/*! + * \brief Returns the "tensorrt" Target instance to use for compilation. 
+ */ +Target GetTensorRTTarget() { + Target target = Target::Current(/*allow_not_defined=*/true); + if (!target.defined() || target->kind->name != "tensorrt") { + // Since we allow partition_for_tensorrt to use the default "tensorrt" target, we should + // similarly allow the custom pass to execute without a specific "tensorrt" target in scope. + target = Target("tensorrt"); + } + return target; +} using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -87,6 +101,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { explicit CollectFromCompositeFunctionBody(TensorRTJSONSerializer* serializer) : serializer_(serializer), node_(std::make_shared()) {} + // We'll need to implement these out-of-band since they use the serializer. void VisitExpr_(const ConstantNode* constant_node) final; void VisitExpr_(const CallNode* call_node) final; @@ -190,6 +205,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { extractor.Extract(const_cast(attr_obj)); } + /*! \brief The parent serializer for the overall TensorRT partition. */ TensorRTJSONSerializer* serializer_; /*! \brief Accumulated translated arguments. */ std::vector args_; @@ -207,9 +223,10 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { */ class TensorRTJSONSerializer : public JSONSerializer { public: - TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) - : JSONSerializer(symbol, expr) {} + TensorRTJSONSerializer(Target target, const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr), target_(std::move(target)) {} + private: using JSONSerializer::VisitExpr_; std::vector VisitExpr_(const CallNode* call_node) final { @@ -245,40 +262,58 @@ class TensorRTJSONSerializer : public JSONSerializer { node->CaptureAttrs(*collector.node_); // Capture global settings on the JSON node. - SaveGlobalAttributes(node); + // TODO(mbs): Why on every call? 
+ SaveGlobalAttributes(node.get()); VLOG(1) << name << " has " << node->GetInputs().size() << " inputs"; return AddNode(node, GetRef(call_node)); } - static void SaveGlobalAttributes(std::shared_ptr node) { - auto ctx = transform::PassContext::Current(); - auto cfg = ctx->GetConfig("relay.ext.tensorrt.options"); - if (!cfg.defined()) { - cfg = AttrsWithDefaultValues(); + static void SetAttr(JSONGraphNode* node, const std::string& key, + std::vector values) { + node->SetAttr(key, std::vector({std::move(values)})); + } + + /*! \brief Capture the compilation options as attributes on \p node. */ + void SaveGlobalAttributes(JSONGraphNode* node) { + { + Array target_attr = target_->GetAttr>("tensorrt_version").value(); + if (target_attr.empty()) { + target_attr = GetVersion(); + } + if (target_attr.empty()) { + target_attr = {6, 0, 1}; + } + ICHECK_EQ(target_attr.size(), 3); + SetAttr(node, "tensorrt_version", + {std::to_string(target_attr[0]), std::to_string(target_attr[1]), + std::to_string(target_attr[2])}); + } + + { + Bool target_attr = target_->GetAttr("use_implicit_batch").value(); + SetAttr(node, "use_implicit_batch", {std::to_string(target_attr->value)}); + } + + { + Integer target_attr = target_->GetAttr("max_workspace_size").value(); + SetAttr(node, "max_workspace_size", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_fp16").value(); + SetAttr(node, "use_fp16", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_uint8").value(); + SetAttr(node, "use_uint8", {std::to_string(target_attr->value)}); } - ICHECK_EQ(cfg.value()->tensorrt_version.size(), 3); - std::vector tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), - std::to_string(cfg.value()->tensorrt_version[1]), - std::to_string(cfg.value()->tensorrt_version[2])}; - std::vector use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; - std::vector max_workspace_size = 
{std::to_string(cfg.value()->max_workspace_size)}; - std::vector use_fp16 = {std::to_string(cfg.value()->use_fp16)}; - std::vector use_uint8 = {std::to_string(cfg.value()->use_uint8)}; - std::vector tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr, - use_fp16_attr, use_uint8_attr; - tensorrt_version_attr.emplace_back(tensorrt_version); - use_implicit_batch_attr.emplace_back(use_implicit_batch); - max_workspace_size_attr.emplace_back(max_workspace_size); - use_fp16_attr.emplace_back(use_fp16); - use_uint8_attr.emplace_back(use_uint8); - node->SetAttr("tensorrt_version", tensorrt_version_attr); - node->SetAttr("use_implicit_batch", use_implicit_batch_attr); - node->SetAttr("max_workspace_size", max_workspace_size_attr); - node->SetAttr("use_fp16", use_fp16_attr); - node->SetAttr("use_uint8", use_uint8_attr); } + + /*! \brief The "tensorrt" Target guiding compilation. */ + Target target_; }; void CollectFromCompositeFunctionBody::VisitExpr_(const ConstantNode* constant_node) { @@ -304,64 +339,75 @@ void CollectFromCompositeFunctionBody::VisitExpr_(const CallNode* call_node) { } /*! - * \brief Create a runtime module for TensorRT. - * \param ref The ext_func Relay expression/module to be executed using extern ops. - * \return A runtime module. - */ -runtime::Module TensorRTCompiler(const ObjectRef& ref) { - ICHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; - Function func = Downcast(ref); - std::string func_name = backend::GetExtSymbol(func); - - VLOG(1) << "TensorRT partition:" << std::endl << PrettyPrint(func); - TensorRTJSONSerializer serializer(func_name, func); - serializer.serialize(); - std::string graph_json = serializer.GetJSON(); - VLOG(1) << "TensorRT JSON:" << std::endl << graph_json; - - // Note that serializer.const_name_to_constant() is ignored. 
Instead the TECompiler invokes - // a callback which calls backend::UpdateConstants to capture the map before the function - // 'disappears' into lowered form, on the assumption the visit order and thus constant - // names match those generated by the JSONSerializer. - - const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); - ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; - VLOG(1) << "Creating tensorrt runtime::Module for '" << func_name << "'"; - runtime::Module lib = (*pf)(func_name, graph_json, serializer.const_names()); - return lib; -} - -TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); - -/*! - * \brief Check whether TensorRT graph executor is enabled. - * \return True if enabled, False if not. + * \brief The main TensorRT compiler. + * + * TODO(mbs): Currently we create a \p TensorRTRuntimeModule for every function with + * Compiler="tensorrt" (ie for each partition). Since the TensorRT engine is only designed to + * handle a single entry point this is mostly sensible, however there are probably opportunities + * for more sharing between functions. However, note this means each call to a TensorRT-compiled + * function will require a linear scan of imported runtime modules to find the matching + * TensorRTRuntimeModule implementing it. */ -inline constexpr bool IsTensorRTRuntimeEnabled() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return true; -#else - return false; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +transform::Pass CompileForTensorRTImpl() { + auto pass_func = [](IRModule mod, const transform::PassContext& pass_ctx) { + VLOG(1) << "CompileForTensorRT input:" << std::endl << PrettyPrint(mod); + Target target = GetTensorRTTarget(); + + const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); + ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; + + // The accumulated external runtime modules. 
+ Array external_mods = + mod->GetAttr>(tvm::attr::kExternalMods, Array()) + .value(); + // The accumulated constant bindings. + Map const_name_to_ndarray = + mod->GetAttr>(tvm::attr::kConstNameToNDArray, + Map()) + .value(); + + for (const auto& kv : mod->functions) { + if (const auto* function_node = kv.second.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler && opt_compiler.value() == "tensorrt") { + // Serialize the function to JSON. + TensorRTJSONSerializer serializer(target, kv.first->name_hint, + GetRef(function_node)); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + VLOG(1) << "TensorRT JSON for '" << kv.first->name_hint << "':" << std::endl + << graph_json; + + // Remember all the constant bindings. + for (const auto& kv2 : serializer.const_name_to_constant()) { + ICHECK_EQ(const_name_to_ndarray.count(kv2.first), 0); + const_name_to_ndarray.Set(kv2.first, kv2.second); + } + + // Create the actual runtime module. + runtime::Module runtime_mod = + (*pf)(kv.first->name_hint, graph_json, serializer.const_names()); + + // Remember the runtime module. + external_mods.push_back(runtime_mod); + } + } + } + } + return WithAttrs(mod, {{tvm::attr::kExternalMods, external_mods}, + {tvm::attr::kConstNameToNDArray, const_name_to_ndarray}}); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "CompileForTensorRT", {}); } -/*! - * \brief Get TensorRT version that TVM is built against. - * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph - * runtime is not enabled. 
- */ -Array GetTensorRTVersion() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; -#else - return {}; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +transform::Pass CompileForTensorRT() { + return transform::Sequential( + {transforms::OutlineCompilerFunctionsWithExistingGlobalSymbols("tensorrt"), + CompileForTensorRTImpl(), transforms::MarkCompilerFunctionsAsExtern("tensorrt")}); } -TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") - .set_body_typed(IsTensorRTRuntimeEnabled); -TVM_REGISTER_GLOBAL("relay.op.get_tensorrt_version").set_body_typed(GetTensorRTVersion); - +} // namespace tensorrt } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/tensorrt/codegen.h b/src/relay/backend/contrib/tensorrt/codegen.h new file mode 100644 index 0000000000000..813a8663756dd --- /dev/null +++ b/src/relay/backend/contrib/tensorrt/codegen.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/tensorrt/codegen.h + * \brief The 'custom' compilation pass for TensorRT (invoked by the RelayToTIRTargetHook pass). 
+ */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ +#define TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ + +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace tensorrt { + +/*! + * \brief Returns the pass which replaces all calls to "Primitive" functions with a "Compiler" + * attribute of "tensorrt" with calls to an extern which is implemented by a \p TensorRTRuntime + * runtime module added to the IRModule's "external_mods" attribute. + */ +transform::Pass CompileForTensorRT(); + +} // namespace tensorrt +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ diff --git a/src/relay/backend/contrib/tensorrt/target.cc b/src/relay/backend/contrib/tensorrt/target.cc index 85d127ab71152..2e4581d30a3c6 100644 --- a/src/relay/backend/contrib/tensorrt/target.cc +++ b/src/relay/backend/contrib/tensorrt/target.cc @@ -24,19 +24,46 @@ #include +#include "./codegen.h" + namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { /*! * \brief This external codegen target can offload compilation to the TensorRT compiler. * - Patterns: python/tvm/relay/op/contrib/tensorrt.py * - Custom compiler: src/relay/backend/contrib/tensorrt/codegen.cc - * - Runtime: src/runtime/contrib/tensorrt/ *.cc + * - Runtime: src/runtime/contrib/tensorrt/... */ TVM_REGISTER_TARGET_KIND("tensorrt", kDLCUDA) - .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)) + .set_attr("RelayToTIR", CompileForTensorRT()) + // A array of three integers given the major, minor, and patch numbers for the supported + // TensorRT compiler version. If empty will be auto-detected from linked library. Default empty. + .add_attr_option>("tensorrt_version", Array()) + // If true, the first tensor dimension for most operators is allowed to be Any and + // TensorRT will assume it represents a batch dimension only known at inference time. 
+ // Fewer Relay operators are supported in implicit batch mode. Default true. + .add_attr_option("use_implicit_batch", Bool(true)) + // If true, excludes sub-graphs which do not have multiply-accumulate operations, even though + // TensorRT supports them. ad. This is a simple heuristic to optimize the partitioning between + // TensorRT and TVM. Not required if using Collage for partitioning. Defalut false. + .add_attr_option("remove_no_mac_subgraphs", Bool(false)) + // How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. + // Default 1G. + .add_attr_option("max_workspace_size", Integer(1 << 30)) + // If true, allows TensorRT to automatically convert float32 operations to float16. Must also be + // enabled if any float16 operations are in the model. Note that TensorRT may still choose a + // higher-precision kernel if it results in overall lower runtime, or if no low-precision + // implementation exists. Default false. + .add_attr_option("use_fp16", Bool(false)) + // If true, allows TensorRT to automatically convert float32 operations to uint8 + // (aka quantized). Default false. 
+ .add_attr_option("use_uint8", Bool(false)); +} // namespace tensorrt } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index bd3047e2862c1..4376f87787086 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -343,6 +343,7 @@ relay::Function BindParamsByName(relay::Function func, void BindParamsInModule(IRModule mod, const std::unordered_map& params) { + VLOG(1) << "BindParamsInModule"; if (!params.empty()) { BaseFunc base_func = mod->Lookup("main"); ICHECK(base_func->IsInstance()); diff --git a/src/relay/collage/README.md b/src/relay/collage/README.md new file mode 100644 index 0000000000000..dc56496092cc0 --- /dev/null +++ b/src/relay/collage/README.md @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + +The `CollagePartition` pass for finding optimal partitionings of Relay models. + +See the [RFC](https://github.com/mbs-octoml/mbs-tvm-rfcs/blob/mbs-rfcs-collage/rfcs/xxxx-collage.md). + +Based on: +> *Collage: Automated Integration of Deep Learning Backends* +> Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia + +CAUTION: This is a prototype, do not use in prod. diff --git a/src/relay/collage/candidate_function_cache.cc b/src/relay/collage/candidate_function_cache.cc new file mode 100644 index 0000000000000..32982dc08f3d7 --- /dev/null +++ b/src/relay/collage/candidate_function_cache.cc @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_function_cache.cc + * \brief A cache of the unique global name and costs for partitioned functions. + */ + +#include "./candidate_function_cache.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidateFunctionCache::Entry& CandidateFunctionCache::GetEntry(const std::string& label, + const Function& function) { + auto itr = cache_.find(function); + if (itr == cache_.end()) { + String compiler = function->GetAttr(attr::kCompiler, String("tvm")).value(); + std::string global_symbol_name = name_supply_->Fresh({compiler, label}); + GlobalVar global_symbol(std::move(global_symbol_name), function->checked_type()); + itr = cache_.emplace(function, Entry(std::move(global_symbol))).first; + } + return itr->second; +} + +GlobalVar CandidateFunctionCache::GetGlobalSymbol(const Function& function) { + return GetEntry(/*label=*/"", function).global_symbol; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_function_cache.h b/src/relay/collage/candidate_function_cache.h new file mode 100644 index 0000000000000..322128c46fbad --- /dev/null +++ b/src/relay/collage/candidate_function_cache.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_function_cache.h + * \brief A cache of the unique global symbol name and cost for partitioned functions. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ + +#include +#include +#include +#include + +#include "../transforms/compiler_function_utils.h" +#include "cost.h" +#include "name_supply.h" +#include "tvm/relay/function.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief A cache of the unique global symbol and cost for functions extracted to represent + * partitions. If two functions are structurally equal (which includes equality of their "Compiler" + * attributes) then they will share the same global symbol and estimated cost. We rely on the + * function's attributes to distinguish partitions which are structurally the same graph but + * intended for different targets. + */ +class CandidateFunctionCache : public transforms::GlobalSymbolCache { + public: + explicit CandidateFunctionCache(std::shared_ptr name_supply) + : name_supply_(std::move(name_supply)) {} + + struct Entry { + GlobalVar global_symbol; + Cost cost = Cost::Unknown(); // Filled in when have estimated cost. + + explicit Entry(GlobalVar global_symbol) : global_symbol(std::move(global_symbol)) {} + }; + + /*! + * \brief Returns the unique entry for \p function. 
If no such entry already exists, create it + * and assign it a unique global symbol name. + */ + Entry& GetEntry(const std::string& label, const Function& function); + + GlobalVar GetGlobalSymbol(const Function& function) final; + + private: + std::shared_ptr name_supply_; + std::unordered_map cache_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ diff --git a/src/relay/collage/candidate_partition.cc b/src/relay/collage/candidate_partition.cc new file mode 100644 index 0000000000000..45365d0c7e0f8 --- /dev/null +++ b/src/relay/collage/candidate_partition.cc @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.cc + * \brief A potential partition in the search. 
+ */ + +#include "./candidate_partition.h" + +#include +#include +#include + +#include "../transforms/compiler_function_utils.h" +#include "./candidate_function_cache.h" +#include "./candidate_set.h" +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(CandidatePartitionNode); + +void CandidatePartitionNode::VisitAttrs(AttrVisitor* v) { + v->Visit("rule_name", &rule_name_); + v->Visit("sub_graph", &sub_graph_); + v->Visit("spec", &spec_); + // TODO(mbs): cost_ +} + +PartitionSpec CandidatePartitionNode::partition_spec() const { + return Downcast(spec_); +} + +std::string CandidatePartitionNode::partition_spec_name() const { + return Downcast(spec_)->spec_name_; +} + +Target CandidatePartitionNode::target() const { return Downcast(spec_)->target_; } + +std::string CandidatePartitionNode::ToSummary(const DataflowGraph& dataflow_graph) const { + std::ostringstream os; + os << sub_graph_->label_; + os << " | ("; + bool first = true; + for (PostDfsIndex index : sub_graph_->input_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") -> ("; + first = true; + for (PostDfsIndex index : sub_graph_->exit_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") | "; + os << sub_graph_->inside_.ToString(); + os << " | "; + os << partition_spec_name(); + os << " | "; + os << cost_.ToString(); + return os.str(); +} + +std::string CandidatePartitionNode::ToString() const { + std::ostringstream os; + os << "{rule_name=" << rule_name_; + os << ",sub_graph=" << sub_graph_->ToString(); + os << ",spec_name=" << 
partition_spec_name(); + if (!cost_.is_unknown()) { + os << ",cost=" << cost_.ToString(); + } + os << "}"; + return os.str(); +} + +namespace { +/*! + * \brief If function's body is a call to an inlined "Primitive" function, return it. + * Otherwise return function directly. + */ +Function GetPrimitiveFunction(const Function& function) { + if (const auto* call_node = function->body.as()) { + if (const auto* function_node = call_node->op.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + return GetRef(function_node); + } + } + } + return function; +} + +/*! + * \brief Eta-expand any tuple arguments of \p function. Ie rewrite: + * \code + * f(x: (t1, t2)) { ... x ... } + * \endcode + * to + * \code + * f(x_1: t1, x_2: t2) { ... (x_1, x_2) ... } + * \endcode + */ +Function EtaExpandTuples(const Function& function) { + Map subst; + Array new_params; + for (const auto& param : function->params) { + std::vector tensor_types = FlattenTupleType(param->type_annotation); + if (tensor_types.size() == 1) { + new_params.push_back(param); + } else { + Array fields; + for (size_t i = 0; i < tensor_types.size(); ++i) { + Var new_param(param->name_hint() + "_" + std::to_string(i), tensor_types[i], param->span); + new_param->checked_type_ = tensor_types[i]; + new_params.push_back(new_param); + fields.push_back(new_param); + } + Tuple new_tuple(fields); + subst.Set(param, new_tuple); + } + } + if (subst.empty()) { + return function; + } + return WithFields(function, new_params, Bind(function->body, subst)); +} + +} // namespace + +Cost CandidatePartitionNode::EstimatedCost(const DataflowGraph& dataflow_graph, + CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) const { + if (cost_.is_unknown()) { + VLOG_CONTEXT << "spec " << partition_spec_name(); + Function extracted_function = sub_graph_->ExtractAsFunction(dataflow_graph); + extracted_function = EtaExpandTuples(extracted_function); + VLOG(2) << "Validating function:" << 
std::endl << PrettyPrint(extracted_function); + String error = partition_spec()->validate_sub_graph_func_(extracted_function); + if (!error.empty()) { + cost_ = Cost::Invalid(); + VLOG(1) << "Unable to rewrite function: " << error; + } else { + // The extracted function may be the eta-expansion of a "Primitive" function. + // If so we want the cached external name and cost to be w.r.t. that function + // rather than the outer so that we'll get a cache hit when we outline functions + // in the final program. + Function primitive_function = GetPrimitiveFunction(extracted_function); + CandidateFunctionCache::Entry& entry = + cache->GetEntry(sub_graph_->label_, primitive_function); + if (entry.cost.is_unknown()) { + IRModule mod = IRModule::FromExpr(extracted_function); + VLOG(1) << "Outlining:" << std::endl << PrettyPrint(mod); + mod = OutlineCompilerFunctions(cache)(mod); + VLOG(1) << "Estimating cost of:" << std::endl + << PrettyPrint(mod) << std::endl + << "using target " << target()->ToDebugString(); + entry.cost = cost_estimator->Estimate(mod, target(), + /*needs_tvm_tuning=*/!target().IsExternalCodegen()); + VLOG(1) << "Measured cost as " << entry.cost.ToString(); + } else { + VLOG(1) << "Reusing cost " << entry.cost.ToString() + << " cached in candidate function cache"; + } + cost_ = entry.cost; + } + } else { + VLOG(1) << "Reusing cost " << cost_.ToString() << " cached in candidate"; + } + return cost_; +} + +CandidatePartition::CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_graph_ = std::move(sub_graph); + node->spec_ = std::move(spec); + node->cost_ = cost; + data_ = std::move(node); +} + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name) { + if (rule_name == candidate->rule_name_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->rule_name_ = 
std::move(rule_name); + return GetRef(node); +} + +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph) { + if (sub_graph == candidate->sub_graph_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->sub_graph_ = std::move(sub_graph); + return GetRef(node); +} + +bool CandidatePartition::operator<(const CandidatePartition& that) const { + // Order lexicographically on sub-graphs. + if (*get()->sub_graph_.get() < *that->sub_graph_.get()) { + return true; + } + if (*that->sub_graph_.get() < *get()->sub_graph_.get()) { + return false; + } + // Break ties by rule name. + return get()->rule_name_ < that->rule_name_; +} + +bool CandidatePartition::AreTouching(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + return get()->spec_ == that->spec_ && + get()->sub_graph_.AreTouching(dataflow_graph, that->sub_graph_); +} + +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + ICHECK_EQ(get()->spec_, that->spec_); + return CandidatePartition(UnionLabels(get()->rule_name_, that->rule_name_), + get()->sub_graph_.DisjointUnion(dataflow_graph, that->sub_graph_), + get()->spec_, get()->cost_ + that->cost_); +} + +/*static*/ +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates) { + ICHECK_GT(candidates.size(), 1); + CandidatePartition result = candidates.front(); + for (size_t i = 1; i < candidates.size(); ++i) { + result = result.DisjointUnion(dataflow_graph, candidates[i]); + } + return result; +} + +/*static*/ +Expr CandidatePartition::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + const std::vector& candidates) { + std::vector sub_graphs; + for (const auto& candidate : candidates) { + sub_graphs.emplace_back(candidate->sub_graph_); + } + return SubGraph::ParallelRewrite(dataflow_graph, expr, sub_graphs); +} + +/*static*/ +std::vector 
CandidatePartition::MaxCoalesce( + const DataflowGraph& dataflow_graph, std::vector candidates) { + VLOG(1) << "Running MaxCoalesce over " << candidates.size() << " candidates"; + // This is an eager version of using the simple (kOpaque, kOpaque) combiner. + + // Switch to set representation. + CandidateSet result_set(std::move(candidates)); + + // Until fixed point... + size_t num_rounds = 0; + while (result_set.PrepareForNextRound()) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << result_set.size() << " candidates (" << result_set.first_new_index() + << " existing)"; + IndexSet removed_this_round(result_set.size()); // over candidate indexes! + + // Build map from post-dfs indices to the indices of candidates with corresponding entry node. + // NOTE: the index set is over candidate indices not post-dfs indices! + std::vector entry_map(dataflow_graph.size(), IndexSet(result_set.size())); + for (size_t i = 0; i < result_set.size(); ++i) { + CandidatePartition candidate = result_set.at(i); + for (PostDfsIndex entry_index : candidate->sub_graph_->entry_) { + entry_map[entry_index].Add(i); + } + } + + for (size_t i = 0; i < result_set.size(); ++i) { + if (removed_this_round[i]) { + // Already merged. + continue; + } + CandidatePartition upstream = result_set.at(i); + // Narrow our search to just those candidates which could touch. + IndexSet possible_downstream(result_set.size()); // over candidate indexes! + for (PostDfsIndex output_index : upstream->sub_graph_->output_) { + possible_downstream = possible_downstream | entry_map[output_index]; + } + for (size_t j : possible_downstream) { + if (removed_this_round[j]) { + // Already merged. + continue; + } + if (i == j) { + // Ignore self. 
+ continue; + } + CandidatePartition downstream = result_set.at(j); + if (!upstream.AreTouching(dataflow_graph, downstream)) { + continue; + } + CandidatePartition new_candidate = upstream.DisjointUnion(dataflow_graph, downstream); + VLOG(2) << "Merging upstream candidate " << upstream->ToString() + << " and downstream candidate " << downstream->ToString() << " to yield " + << new_candidate->ToString(); + result_set.Add(dataflow_graph, new_candidate); + result_set.Remove(upstream); + removed_this_round.Add(i); + result_set.Remove(downstream); + removed_this_round.Add(j); + } + } + } + + // Restore canonical order. + result_set.sort(); + + VLOG(1) << "MaxCoalesce produced " << result_set.size() << " candidates"; + return result_set.MovedCurrentCandidates(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_partition.h b/src/relay/collage/candidate_partition.h new file mode 100644 index 0000000000000..1e324666fc658 --- /dev/null +++ b/src/relay/collage/candidate_partition.h @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.h + * \brief A potential partition in the search. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ + +#include +#include + +#include +#include +#include + +#include "./candidate_function_cache.h" +#include "./cost.h" +#include "./cost_estimator.h" +#include "./name_supply.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +class PartitionSpec; + +/*! + * \brief A candidate partition w.r.t. the body of an overall Relay expression. + * + * We represent the partition as a sub-graph. This means not only can we represent the scope + * of Relay sub-expressions intended for a particular partition (or kernel), but we can also + * represent various conventions for encoding how the operators in the partition should be + * tagged for downstream processing. + */ +class CandidatePartitionNode : public Object { + public: + CandidatePartitionNode() = default; + + /*! + * \brief Combination of all the partition rule names which produced this candidate. + * For debugging and explainability. + */ + String rule_name_; + + /*! + * \brief The sub-graph of the overall expression matched by the partition rule. + */ + SubGraph sub_graph_; + + /*! + * \brief The partition specification which produced this candidate. + */ + ObjectRef /* actually PartitionSpec */ spec_; + + /*! + * \brief The (cached) cost of the partition. + * + * Initially Cost::Unknown, calculated and cached by EstimateCost. + */ + mutable Cost cost_ = Cost::Unknown(); + + void VisitAttrs(AttrVisitor* v); + + /*! + * \brief Returns the partition specification which produced this candidate. + */ + PartitionSpec partition_spec() const; + + /*! + * \brief Returns the name of the partition specification which produced this candidate. + */ + std::string partition_spec_name() const; + + /*! + * \brief Returns the target of the partition specification which produced this candidate. + */ + Target target() const; + + /*! 
+ * \brief Return the estimated cost of the candidate partition, using \p cost_estimator and + * \p cache. + */ + Cost EstimatedCost(const DataflowGraph& dataflow_graph, CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) const; + + /*! + * \brief Returns a brief description of candidate suitable for debugging output. + */ + std::string ToSummary(const DataflowGraph& dataflow_graph) const; + + std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.CandidatePartition"; + TVM_DECLARE_FINAL_OBJECT_INFO(CandidatePartitionNode, Object); +}; + +class CandidatePartition : public ObjectRef { + public: + CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost = Cost::Unknown()); + + bool operator<(const CandidatePartition& that) const; + + /*! + * \brief Returns true if this and \p that candidate are disjoint, have the same (or no) target, + * and touch. This does not imply the \p DisjointUnion of this and that will be valid. For + * example, the result may be too deep or have too many outputs. + */ + bool AreTouching(const DataflowGraph& dataflow_graph, const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of this and \p that. + */ + CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of all \p candidates. + */ + static CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates); + + /*! + * \brief Returns \p expr rewritten to apply all the partitions implied by \p candidates. + * The candidates can be in any order but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + const std::vector& candidates); + + /*! + * Eagerly merge all touching candidates for the same target. 
The candidates must be disjoint + * and have their Targets filled in. This is typically called on the optimal list of candidate + * partitions found by the Collage search in order to remove unnecessary partition boundaries. + * Ideally the search would never produce such candidates however to keep the search space + * manageable Collage may only consider candidate partitions up to a particular depth. + */ + static std::vector MaxCoalesce(const DataflowGraph& dataflow_graph, + std::vector candidates); + + TVM_DEFINE_OBJECT_REF_METHODS(CandidatePartition, ObjectRef, CandidatePartitionNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(CandidatePartitionNode); +}; + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name); +CandidatePartition WithTarget(CandidatePartition candidate, Target target); +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph); + +struct CandidatePartitionHash { + size_t operator()(const CandidatePartition& candidate) const { + return candidate->sub_graph_->hash(); + } +}; + +struct CandidatePartitionEquals { + bool operator()(const CandidatePartition& left, const CandidatePartition& right) const { + return *left->sub_graph_.get() == *right->sub_graph_.get(); + } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ diff --git a/src/relay/collage/candidate_partition_index.cc b/src/relay/collage/candidate_partition_index.cc new file mode 100644 index 0000000000000..7541df87d331c --- /dev/null +++ b/src/relay/collage/candidate_partition_index.cc @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/candidate_partition_index.cc + * \brief Index for finding relevant candidate partitions for a particular search state. + */ + +#include "./candidate_partition_index.h" + +#include "./gather_partition_specs.h" +#include "./prune_candidates.h" +#include "utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidatePartitionIndex::CandidatePartitionIndex( + const std::unordered_map* virtual_devices, + DataflowGraph* dataflow_graph) + : virtual_devices_(virtual_devices), + dataflow_graph_(dataflow_graph), + first_inside_index_to_candidates_(dataflow_graph->size()) {} + +void CandidatePartitionIndex::Index(const Array& partition_specs) { + std::vector candidates = Collect(partition_specs); + candidates = PruneCandidates(*dataflow_graph_, candidates); + // Index the candidates by their first inside index. 
+ for (auto& candidate : candidates) { + first_inside_index_to_candidates_[candidate->sub_graph_->first_inside_index_].emplace_back( + candidate); + } + size_ = candidates.size(); +} + +void CandidatePartitionIndex::EstimateAllCosts(CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) { + size_t n = 0; + for (PostDfsIndex index = 0; index < dataflow_graph_->size(); ++index) { + for (const auto& candidate : first_inside_index_to_candidates_[index]) { + LOG(INFO) << "Estimating cost of candidate " << candidate->ToSummary(*dataflow_graph_) << " [" + << n++ << "/" << size_ << "]"; + // Cost will be cached in candidate as a side effect. + Cost cost = candidate->EstimatedCost(*dataflow_graph_, cost_estimator, config, cache); + LOG(INFO) << "Candidate has cost " << cost.ToString(); + } + } +} + +std::string CandidatePartitionIndex::ToSummary() const { + std::vector lines; + for (const auto& candidates : first_inside_index_to_candidates_) { + for (const auto& candidate : candidates) { + if (candidate->partition_spec_name() == kHostSpecName) { + continue; + } + lines.emplace_back(candidate->ToSummary(*dataflow_graph_)); + } + } + std::sort(lines.begin(), lines.end()); + std::ostringstream os; + bool first = true; + for (const auto& line : lines) { + if (first) { + first = false; + } else { + os << std::endl; + } + os << line; + } + return os.str(); +} + +bool CandidatePartitionIndex::IsCompatibleWithVirtualDevice(const CandidatePartition& candidate) { + for (PostDfsIndex index : candidate->sub_graph_->inside_) { + const ExprNode* sub_expr_node = dataflow_graph_->index_to_node(index)->node_ref_; + auto itr = virtual_devices_->find(sub_expr_node); + ICHECK(itr != virtual_devices_->end()); + const Target& existing_target = itr->second->target; + if (!existing_target.defined()) { + // No constraint. + continue; + } + if (StructuralEqual()(existing_target, candidate->target())) { + // No disagreement. 
+ continue; + } + if (!candidate->target().IsExternalCodegenFor(itr->second->target)) { + // The candidate's target is not an external codegen target compatible with the existing + // target. + // TODO(mbs): There's a conflict here between Collage's desire to leave some expression nodes + // 'behind' on the VM and PlanDevice's desire to assign a primitive Target to every node. + // I think PlanDevices is the one that needs to give here by leaving such nodes + // unconstrained. + VLOG(1) << "Ignoring candidate " << candidate->ToString() + << " since incompatible with existing virtual device assignment of:" << std::endl + << itr->second << std::endl + << "to sub-graph:" << std::endl + << PrettyPrint(GetRef(sub_expr_node)); + return false; + } + } + return true; +} + +std::vector CandidatePartitionIndex::Collect( + const Array& partition_specs) { + VLOG_CONTEXT << "collecting"; + std::vector result; + for (const auto& spec : partition_specs) { + VLOG_CONTEXT << "spec " << spec->spec_name_; + VLOG(1) << "collecting candidates"; + std::vector candidates = spec->AllCandidates(*dataflow_graph_); + for (auto& candidate : candidates) { + if (!IsCompatibleWithVirtualDevice(candidate)) { + continue; + } + result.push_back(candidate); + } + } + VLOG(1) << "Found " << result.size() << " candidates"; + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_partition_index.h b/src/relay/collage/candidate_partition_index.h new file mode 100644 index 0000000000000..cfb83de829967 --- /dev/null +++ b/src/relay/collage/candidate_partition_index.h @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/candidate_partition_index.h + * \brief Index for finding relevant candidate partitions for a particular search state. + */ +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ + +#include + +#include +#include +#include +#include + +#include "partition_spec.h" +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Collects and indexes all the candidate partitions for the overall expression. This index + * is used during partitioning search to find the next valid candidate partition to explore from the + * current search state. We do not yet attempt to estimate the cost of each candidate partition, and + * when we do so during the search we may discover it to be infeasible. + */ +class CandidatePartitionIndex { + public: + CandidatePartitionIndex(const std::unordered_map* virtual_devices, + DataflowGraph* dataflow_graph); + + /*! \brief Constructs the index. */ + void Index(const Array& partition_specs); + + /*! \brief Returns all the candidates which may begin at \p index. */ + const std::vector& candidates_at(PostDfsIndex index) const { + ICHECK_LT(index, dataflow_graph_->size()); + return first_inside_index_to_candidates_[index]; + } + + /*! \brief Estimates the costs of all candidates in the index. Each candidate caches its cost. 
*/ + void EstimateAllCosts(CostEstimator cost_estimator, const CompilationConfig& config, + std::shared_ptr cache); + + size_t size() const { return size_; } + + std::string ToSummary() const; + + private: + /*! + * \brief Returns true if \p candidate's desired target is compatible with any existing target + * constraints on the candidate's sub-expressions. + */ + bool IsCompatibleWithVirtualDevice(const CandidatePartition& candidate); + + /*! \brief Returns all valid candidates found from \p partition_specs. */ + std::vector Collect(const Array& partition_specs); + + /*! + * \brief The \p VirtualDevice for every sub-expression in the overall expression. Needed to + * ensure candidates do not contradict the target/device placement already determined by + * device planning. + */ + const std::unordered_map* virtual_devices_; + + /*! \brief Dataflow graph for overall expression. */ + DataflowGraph* dataflow_graph_; + + /*! + * \brief Maps post-dfs indexes to all the candidates which have that as their first inside + * index, and which should be considered in the Collage search. + */ + std::vector> first_inside_index_to_candidates_; + + /*! \brief Number of entries in above. */ + size_t size_ = 0; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ diff --git a/src/relay/collage/candidate_set.cc b/src/relay/collage/candidate_set.cc new file mode 100644 index 0000000000000..2c2a7eaf8d540 --- /dev/null +++ b/src/relay/collage/candidate_set.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.cc + * \brief Collects a set of candidate partitions. + */ + +#include "./candidate_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidateSet::CandidateSet(std::vector candidates_to_add) + : candidates_to_add_(std::move(candidates_to_add)) { + for (const auto& candidate : candidates_to_add_) { + seen_.emplace(candidate); + } +} + +void CandidateSet::Add(const DataflowGraph& dataflow_graph, + const CandidatePartition& new_candidate) { + VLOG(2) << "adding " << new_candidate->ToString(); + if (seen_.count(new_candidate)) { + VLOG(2) << "already seen candidate, ignoring"; + return; + } + seen_.emplace(new_candidate); + candidates_to_add_.emplace_back(new_candidate); +} + +void CandidateSet::Remove(const CandidatePartition& old_candidate) { + ICHECK(seen_.count(old_candidate)); + VLOG(2) << "removing " << old_candidate->ToString(); + candidates_to_remove_.emplace_back(old_candidate); +} + +bool CandidateSet::PrepareForNextRound() { + size_t init_size = current_candidates_.size(); + for (const auto& candidate_to_remove : candidates_to_remove_) { + current_candidates_.erase( + std::remove(current_candidates_.begin(), current_candidates_.end(), candidate_to_remove), + current_candidates_.end()); + } + size_t num_removed = init_size - current_candidates_.size(); + candidates_to_remove_.clear(); + first_new_index_ = current_candidates_.size(); + for (const auto& new_candidate : candidates_to_add_) { + current_candidates_.push_back(new_candidate); + } + size_t num_added = 
candidates_to_add_.size(); + candidates_to_add_.clear(); + VLOG(1) << "removed " << num_removed << " and added " << num_added << " candidates"; + return num_removed + num_added > 0; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_set.h b/src/relay/collage/candidate_set.h new file mode 100644 index 0000000000000..4cb2c40e9500e --- /dev/null +++ b/src/relay/collage/candidate_set.h @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.h + * \brief Collects a set of candidate partitions. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ + +#include +#include +#include +#include + +#include "./candidate_partition.h" +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Holds a vector of current candidates and the additions/removals to apply to them. + */ +struct CandidateSet { + CandidateSet() = default; + + explicit CandidateSet(std::vector candidates_to_add); + + /*! + * \brief Schedule \p new_candidate for addition before the next round (unless it is not valid). 
+ */ + void Add(const DataflowGraph& dataflow_graph, const CandidatePartition& new_candidate); + + /*! \brief Schedule \p old_candidate for removal before the next round. */ + void Remove(const CandidatePartition& old_candidate); + + /*! + * \brief Update \p current_candidates and \p first_new_index. Return false if no + * new candidates were added, in which case we have reached a fixed point. + */ + bool PrepareForNextRound(); + + size_t size() const { return current_candidates_.size(); } + + CandidatePartition operator[](size_t i) const { + ICHECK_LT(i, current_candidates_.size()); + return current_candidates_[i]; + } + CandidatePartition at(size_t i) const { return (*this)[i]; } + + size_t first_new_index() const { return first_new_index_; } + + void sort() { std::sort(current_candidates_.begin(), current_candidates_.end()); } + + std::vector MovedCurrentCandidates() { + return std::move(current_candidates_); + } + + private: + /*! + * \brief Index of first candidate in current_candidates added in last round. This can be used to + * avoid considering candidates or candidate combinations which have already been considered in an + * earlier round. + */ + size_t first_new_index_ = 0; + /*! \brief Candidates gathered in previous rounds. */ + std::vector current_candidates_; + /*! \brief New candidates gathered in the current round. */ + std::vector candidates_to_add_; + /*! \brief Existing candidates to remove before starting the next round. */ + std::vector candidates_to_remove_; + /*! \brief Which candidates have been seen so far and should not be added again. 
*/ + std::unordered_set seen_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ diff --git a/src/relay/collage/collage_partitioner.cc b/src/relay/collage/collage_partitioner.cc new file mode 100644 index 0000000000000..52abd7c08c45b --- /dev/null +++ b/src/relay/collage/collage_partitioner.cc @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/collage_partitioner.cc + * \brief Search for an optimal partitioning of a Relay model. 
+ */ + +#include "./collage_partitioner.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../transforms/compiler_function_utils.h" +#include "./candidate_partition.h" +#include "./candidate_partition_index.h" +#include "./cost.h" +#include "./cost_estimator.h" +#include "./gather_partition_specs.h" +#include "./name_supply.h" +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./priority_queue.h" +#include "./recover_virtual_device_map.h" +#include "./sub_graph.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { +namespace { + +TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.tvm_max_max_depth", Integer); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.byoc_max_max_depth", Integer); + +/*! + * \brief Represents the overall expression after some number of non-overlapping candidate + * partitions have been applied. + */ +class SearchState { + public: + explicit SearchState(IndexSet covered) : covered_(std::move(covered)) {} + + /*! + * \brief Order states by increasing best cost, breaking ties by lexicographic order on + * the covering sub graph. + */ + bool operator<(const SearchState& that) const { + return std::tie(best_cost_, covered_) < std::tie(that.best_cost_, that.covered_); + } + + const IndexSet& covered() const { return covered_; } + + std::string ToString() const { + std::ostringstream os; + os << "State("; + os << "covered=" << covered_.ToString(); + os << ",best_cost=" << best_cost_.ToString(); + if (best_candidate_.defined()) { + os << ",best_candidate=" << best_candidate_->ToString(); + } + os << ")"; + return os.str(); + } + + private: + /*! \brief Which nodes of overall expression have been placed on all paths to this state. */ + IndexSet covered_; + /*! \brief Predecessor state for sequence of candidates reaching this state with least + * cost. Null if initial search state. 
*/ + SearchState* pred_state_ = nullptr; + /*! + * \brief Cost of reaching this state using placement implied by path given by pred_state fields. + * Includes estimated/measured cost of all candidates plus any candidate launch penalty. + * Initially invalid cost. + */ + Cost best_cost_ = Cost::Invalid(); + /*! \brief Candidate partition selected in transition from pred_state to this state. */ + CandidatePartition best_candidate_; + + friend class Partitioner; +}; + +struct CompareSearchStatePtrs { + bool operator()(const SearchState* left, const SearchState* right) const { + return *left < *right; + } +}; + +struct EqualSearchStatePtrs { + bool operator()(const SearchState* left, const SearchState* right) const { + return left->covered() == right->covered(); + } +}; + +/*! + * \brief Finds the optimal partitioning of an expression to candidate partitions. + * Though no candidate partitions overlap, it is possible some sub-expressions end up in + * no candidate. Those sub-expressions must be evaluated by the host executor (eg VM). + */ +class Partitioner { + public: + explicit Partitioner(CompilationConfig config, Array partition_specs, + const std::unordered_map* virtual_devices, + CostEstimator cost_estimator, std::shared_ptr cache) + : config_(std::move(config)), + partition_specs_(std::move(partition_specs)), + virtual_devices_(virtual_devices), + cost_estimator_(std::move(cost_estimator)), + cache_(std::move(cache)) {} + + Expr Partition(const Expr& expr) { + // Establish core data structures. + dataflow_graph_ = std::make_unique(expr); + VLOG(1) << "Created dataflow graph with " << dataflow_graph_->size() << " nodes"; + + // Build the candidate index. This is where all the partition rules are invoked . + index_ = std::make_unique(virtual_devices_, dataflow_graph_.get()); + index_->Index(partition_specs_); + VLOG(1) << "All candidates before search:" << std::endl << index_->ToSummary(); + + // 'Eagerly' estimate the cost of all candidates. 
+ // + // Note if this is not done costs will simply be estimated 'lazily' as the search proceeds. + // Typically, some candidates are never explored during the search because: + // - There are no paths in which the candidate does not intersect candidates already + // applied on the path. + // - The Dijkstra search terminates early with a least cost path. + // So eager may result in more estimation overhead. However, eager could be made + // embarrassingly parallel. + VLOG(1) << "Beginning eager cost estimation"; + index_->EstimateAllCosts(cost_estimator_, config_, cache_); + VLOG(1) << "Finished eager cost estimation"; + + // Setup initial state. + SearchState* init_state = GetState(IndexSet(dataflow_graph_->size())); + init_state->best_cost_ = Cost::Zero(); + pq_.Push(init_state); + + size_t num_transitions = 0; + + VLOG(1) << "#### Commencing Collage search over " << index_->size() << " candidates ####"; + while (!pq_.empty()) { + SearchState* curr_state = pq_.Pop(); + VLOG(1) << "Looking at state " << curr_state->covered_.ToString(); + PostDfsIndex next_index = curr_state->covered_.FirstOutsideIndex(); + + if (next_index >= dataflow_graph_->size()) { + // The entire expression has been explored. Collect the candidates on the optimal path. 
+ VLOG(1) << "#### Finished Collage search after exploring " << num_transitions + << " transitions ####"; + std::vector best_candidates; + while (curr_state != init_state) { + ICHECK(curr_state->best_candidate_.defined()); + best_candidates.emplace_back(curr_state->best_candidate_); + curr_state = curr_state->pred_state_; + ICHECK(curr_state != nullptr); + } + return Finalize(expr, best_candidates); + } + + size_t num_fires = 0; + Expr sub_expr = dataflow_graph_->index_to_node(next_index)->ref(); + VLOG(1) << "Looking at index " << next_index << " for sub-expression " + << SubExprKindAndLabel(sub_expr).second << " out of " << dataflow_graph_->size() + << " total dataflow nodes"; + + // Explore all the outgoing candidates from the current state. + for (const auto& candidate : index_->candidates_at(next_index)) { + VLOG(1) << "Considering candidate " << candidate->ToSummary(*dataflow_graph_) + << " for transition " << ++num_transitions << " over " << index_->size() + << " total candidates"; + if (!candidate->sub_graph_->inside_.AreDisjoint(curr_state->covered_)) { + LOG(INFO) << "Candidate overlaps with already partitioned nodes"; + continue; + } + IndexSet next_covered = curr_state->covered_ | candidate->sub_graph_->inside_; + SearchState* next_state = GetState(next_covered); + Relax(curr_state, next_state, candidate); + ++num_fires; + } + ICHECK_GT(num_fires, 0) + << "No candidate was found covering sub-expression at index " << next_index + << ", suggesting the partition rules are incomplete for the given targets."; + } + ICHECK(false) << "should have reached end state in which all sub-expressions are covered"; + return {}; + } + + /*! \brief Returns the unique state corresponding to the \p covered sub-graph. 
*/ + SearchState* GetState(const IndexSet& covered) { + auto itr = covered_to_state_.find(covered); + if (itr != covered_to_state_.end()) { + return itr->second.get(); + } + auto state = std::make_unique(covered); + SearchState* raw_ptr = state.get(); + covered_to_state_.emplace(covered, std::move(state)); + return raw_ptr; + } + + /*! + * \brief Record that it is possible to reach \p next_state by choosing \p candidate + * in \p curr_state. If the resulting cost is better than the best known so far, update + * \p next_state's best cost, predecessor and candidate to match. + */ + void Relax(SearchState* curr_state, SearchState* next_state, + const CandidatePartition& candidate) { + // Note this may already be cached if the candidate partition costs were 'eagerly' estimated. + Cost candidate_cost = + candidate->EstimatedCost(*dataflow_graph_, cost_estimator_, config_, cache_); + VLOG(1) << "Candidate has cost " << candidate_cost.ToString(); + Cost new_state_cost = candidate_cost + curr_state->best_cost_; + const bool is_new = next_state->best_cost_.is_invalid(); + CandidatePartition previously_best_candidate = next_state->best_candidate_; + if (is_new || new_state_cost < next_state->best_cost_) { + next_state->pred_state_ = curr_state; + Cost previously_best_cost = next_state->best_cost_; + next_state->best_cost_ = new_state_cost; + next_state->best_candidate_ = candidate; + if (is_new) { + VLOG(1) << "transition " << curr_state->ToString() << " --> " << next_state->ToString() + << " (New state for spec " << candidate->partition_spec_name() << ")"; + pq_.Push(next_state); + } else { + VLOG(1) << "transition " << curr_state->ToString() << " --> " << next_state->ToString() + << " (Spec " << candidate->partition_spec_name() << " beats previous spec " + << previously_best_candidate->partition_spec_name() << " by " + << (previously_best_cost - curr_state->best_cost_).ToString() << ")"; + pq_.Update(next_state); + } + } else { + VLOG(1) << "transition " << 
curr_state->ToString() << " --> " << next_state->ToString() + << " (Spec " << candidate->partition_spec_name() << " does not beat existing spec " + << previously_best_candidate->partition_spec_name() << ")"; + } + } + + /*! + * \brief Returns the result of partitioning \p expr according to 'optimal' candidates found + * by the search. + */ + Expr Finalize(const Expr& expr, std::vector best_candidates) { + best_candidates = CandidatePartition::MaxCoalesce(*dataflow_graph_, best_candidates); + + Cost total_cost = Cost::Zero(); + std::ostringstream os; + os << "Optimal partitioning:" << std::endl; + for (const auto& best_candidate : best_candidates) { + if (best_candidate->partition_spec_name() == kHostSpecName) { + continue; + } + os << best_candidate->ToSummary(*dataflow_graph_); + os << std::endl; + total_cost = total_cost + best_candidate->cost_; + } + os << "Estimated overall cost is " << total_cost.ToString(); + LOG(INFO) << os.str(); + + LOG(INFO) << "All candidates after search:" << std::endl << index_->ToSummary(); + + return CandidatePartition::ParallelRewrite(*dataflow_graph_, expr, best_candidates); + } + + private: + /*! \brief Available targets, including both 'regular' and 'external codegen'. */ + CompilationConfig config_; + /*! \brief Available partition specs to use during search. */ + Array partition_specs_; + /*! + * \brief The virtual devices for every sub-expression so we can respect any existing target + * constraints. + */ + const std::unordered_map* virtual_devices_; + /*! \brief Cost estimator to use for candidates. */ + CostEstimator cost_estimator_; + /*! \brief Cached names and costs for all partition functions. */ + std::shared_ptr cache_; + /*! \brief Dataflow graph for overall expression. */ + std::unique_ptr dataflow_graph_; + /*! \brief Index of all avoilable candidates we are searching over. */ + std::unique_ptr index_; + /*! \brief Map from covered sub-graphs to the corresponding state. 
*/ + std::unordered_map, IndexSetHash, IndexSetEqual> + covered_to_state_; + /*! \brief Priority queue of states, ordered by increasing cost. */ + PriorityQueue pq_; +}; + +} // namespace + +transform::Pass CollagePartition(CompilationConfig config, CostEstimator cost_estimator) { + runtime::TypedPackedFunc pass_func = + [config = std::move(config), cost_estimator = std::move(cost_estimator)]( + IRModule mod, transform::PassContext ctxt) { + VLOG(1) << "CollagePartition input:" << std::endl << PrettyPrint(mod); + + Array partition_specs = GatherPartitionSpecs(config); + VLOG(1) << "Gathered " << partition_specs.size() << " partition specs"; + + auto cache = + std::make_shared(std::make_shared("collage")); + + IRModule out_mod = mod->ShallowCopy(); + for (const auto& kv : mod->functions) { + if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) { + auto function = GetRef(function_node); + std::unordered_map virtual_devices = + RecoverVirtualDeviceMap(mod, function); + Partitioner partitioner(config, partition_specs, &virtual_devices, cost_estimator, + cache); + Function result = Downcast(partitioner.Partition(function)); + out_mod->Add(kv.first, result); + } + } + + out_mod = OutlineCompilerFunctions(cache)(std::move(out_mod)); + VLOG(1) << "CollagePartition result:" << std::endl << PrettyPrint(out_mod); + return out_mod; + }; + return tvm::transform::CreateModulePass(pass_func, /*opt_level=*/0, "CollagePartition", {}); +} + +TVM_REGISTER_GLOBAL("relay._transform.CollagePartition").set_body_typed(CollagePartition); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/collage_partitioner.h b/src/relay/collage/collage_partitioner.h new file mode 100644 index 0000000000000..7c8de87ffe0a3 --- /dev/null +++ b/src/relay/collage/collage_partitioner.h @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/collage_partitioner.h + * \brief Search for an optimal partitioning of a Relay model. + * + * See: + * Collage: Automated Integration of Deep Learning Backends + * Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia + * https://arxiv.org/pdf/2111.00655.pdf + */ +#ifndef TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ +#define TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ + +#include + +#include "./cost_estimator.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Explores the space of all possible (sub-graph, target) pairs which cover the + * model, and applies the globally optimal choice (assuming partition costs are additive). + */ +transform::Pass CollagePartition(CompilationConfig config, CostEstimator cost_estimator); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ diff --git a/src/relay/collage/combiner_rule.cc b/src/relay/collage/combiner_rule.cc new file mode 100644 index 0000000000000..bf6e0eec1cf42 --- /dev/null +++ b/src/relay/collage/combiner_rule.cc @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/combiner_rule.cc + * \brief Helpers for the \p CombinePartitionRule + */ + +#include "./combiner_rule.h" + +#include "./partition_spec.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(SimpleCombinerRuleNode); + +void SimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +bool SimpleCombinerRuleNode::Fires(const DataflowGraph& dataflow_graph, + const CandidatePartition& upstream, + const CandidatePartition& downstream) const { + return false; +} + +std::string SimpleCombinerRuleNode::ToString() const { + return "SimpleCombinerRule(" + rule_name_ + ")"; +} + +SimpleCombinerRule::SimpleCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(ByKindSimpleCombinerRuleNode); + +void ByKindSimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +bool ByKindSimpleCombinerRuleNode::Fires(const DataflowGraph& dataflow_graph, + const CandidatePartition& upstream, + const CandidatePartition& downstream) const { + return upstream->sub_graph_->kind_ <= upstream_kind_ && + downstream->sub_graph_->kind_ <= downstream_kind_; +} + +std::string 
ByKindSimpleCombinerRuleNode::ToString() const { + std::ostringstream os; + os << "ByKindSimpleCombinerRule(" << rule_name_ << ")"; + return os.str(); +} + +ByKindSimpleCombinerRule::ByKindSimpleCombinerRule(OpPatternKind upstream_kind, + OpPatternKind downstream_kind) { + auto node = runtime::make_object(); + String rule_name = KindToString(upstream_kind) + "->" + KindToString(downstream_kind); + node->rule_name_ = std::move(rule_name); + node->upstream_kind_ = upstream_kind; + node->downstream_kind_ = downstream_kind; + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CombinerRuleNode); + +void CombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void CombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const {} + +std::string CombinerRuleNode::ToString() const { return "CombinerRuleNode(" + rule_name_ + ")"; } + +CombinerRule::CombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(AllSimpleCombinerRuleNode); + +void AllSimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void AllSimpleCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running AllSimpleCombinerRule(" << rule_name_ << ")"; + // Build map from post-dfs indices to the indices of candidates with corresponding entry node. + // NOTE: the index set is over candidate indices not post-dfs indices! + std::vector entry_map(ctxt->dataflow_graph->size(), + IndexSet(ctxt->candidate_set->size())); + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition candidate = ctxt->candidate_set->at(i); + for (PostDfsIndex entry_index : candidate->sub_graph_->entry_) { + entry_map[entry_index].Add(i); + } + } + + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition upstream = ctxt->candidate_set->at(i); + // Narrow our search to just those candidates which could touch. 
+ IndexSet possible_downstream(ctxt->candidate_set->size()); + for (PostDfsIndex output_index : upstream->sub_graph_->output_) { + possible_downstream = possible_downstream | entry_map[output_index]; + } + size_t start_j = + i < ctxt->candidate_set->first_new_index() ? ctxt->candidate_set->first_new_index() : 0; + for (size_t j : possible_downstream) { + if (i == j) { + continue; + } + if (i < start_j) { + // We already explored the cross-product of candidates [0, first_new_index), so don't + // do it again. + continue; + } + // Note that the rules are not commutative so we can't just ignore if j < i. + CandidatePartition downstream = ctxt->candidate_set->at(j); + if (ctxt->max_max_depth > 0 && + upstream->sub_graph_->max_depth_ + downstream->sub_graph_->max_depth_ > + ctxt->max_max_depth) { + continue; + } + if (!upstream.AreTouching(*ctxt->dataflow_graph, downstream)) { + continue; + } + for (const auto& simple_rule : simple_rules_) { + if (simple_rule->Fires(*ctxt->dataflow_graph, upstream, downstream)) { + CandidatePartition new_candidate = + upstream.DisjointUnion(*ctxt->dataflow_graph, downstream); + VLOG(2) << "Fired " << simple_rule->rule_name_ << " on upstream candidate " + << upstream->ToString() << " and downstream candidate " << downstream->ToString() + << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } + } +} + +std::string AllSimpleCombinerRuleNode::ToString() const { + std::ostringstream os; + os << "AllSimpleCombinerRule(" << rule_name_; + for (const auto& simple : simple_rules_) { + os << ", " << simple->ToString(); + } + os << ")"; + return os.str(); +} + +AllSimpleCombinerRule::AllSimpleCombinerRule(String rule_name, + Array simple_rules) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->simple_rules_ = std::move(simple_rules); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(TupleArgCombinerRuleNode); + +void 
TupleArgCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void TupleArgCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running TupleArgCombinerRule(" << rule_name_ << ")"; + // Build map from post-dfs index to the indices of injective candidates with corresponding entry + // node. NOTE: the index set is over candidate indices not post-dfs indices! + std::vector exit_map(ctxt->dataflow_graph->size(), + IndexSet(ctxt->candidate_set->size())); + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition candidate = ctxt->candidate_set->at(i); + if (candidate->sub_graph_->kind_ > kInjective) { + continue; + } + for (PostDfsIndex exit_index : candidate->sub_graph_->exit_) { + exit_map[exit_index].Add(i); + } + } + + // The two-step I -> tuple -> I rule. + // Look all possible tuple consumers... + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition tuple_consumer_candidate = ctxt->candidate_set->at(i); + if (tuple_consumer_candidate->sub_graph_->kind_ > kInjective) { + continue; + } + // For all possible tuples feeding into candidate... + for (PostDfsIndex input_index : tuple_consumer_candidate->sub_graph_->input_) { + auto node = ctxt->dataflow_graph->index_to_node(input_index); + Expr sub_expr = node->ref(); + const auto* tuple_node = sub_expr.as(); + if (tuple_node == nullptr) { + continue; + } + // The tuple_consumer_candidate candidate consumes (at least one) tuple, eg as an argument + // to an operator. + // eg: concatenate((field1, ..., fieldn)) + auto tuple_dataflow_node = ctxt->dataflow_graph->item_to_node(tuple_node); + + // Collect all the possible unions. There may be more than one if different candidates + // could supply the same tuple field. + std::vector> all_possible_unions; + + // Obviously we must include the consumer. 
+ all_possible_unions.emplace_back(); + all_possible_unions.back().emplace_back(tuple_consumer_candidate); + + // We must include the tuple itself. + SubGraph tuple_sub_graph(*ctxt->dataflow_graph, + IndexSet(ctxt->dataflow_graph->size(), {node->index_}), kInjective, + "tuple"); + CandidatePartition tuple_candidate("", std::move(tuple_sub_graph), + tuple_consumer_candidate->partition_spec()); + all_possible_unions.back().emplace_back(std::move(tuple_candidate)); + + // For all tuple fields... + bool all_tuple_fields_have_producer = true; + for (auto* tuple_field_dataflow_node : tuple_dataflow_node->inputs_) { + // Collect all the candidates which could produce this tuple field. + std::vector to_appends; + size_t start_j = + i < ctxt->candidate_set->first_new_index() ? ctxt->candidate_set->first_new_index() : 0; + for (size_t j : exit_map[tuple_field_dataflow_node->index_]) { + if (i == j) { + continue; + } + if (i < start_j) { + // We already explored the cross-product of candidates [0, first_new_index), so don't + // do it again. + continue; + } + CandidatePartition tuple_field_producer = ctxt->candidate_set->at(j); + // The tuple_field_producer candidate can provide this tuple field. + // eg concatenate((..., producer, ...)) + to_appends.emplace_back(tuple_field_producer); + } + if (to_appends.empty()) { + // At least one of the tuple's fields does not have a producer candidate we can + // union in, so we need to give up. + all_tuple_fields_have_producer = false; + break; + } else { + // If to_appends = [A, B] and we already have possible unions [C, D] and [E, F] then + // the new possible unions are [C, D, A], [C, D, B], [E, F, A] and [E, F, B]. 
+ std::vector> new_all_possible_unions; + for (const auto& to_append : to_appends) { + for (const auto& possible_union : all_possible_unions) { + new_all_possible_unions.emplace_back(possible_union); + new_all_possible_unions.back().emplace_back(to_append); + } + } + all_possible_unions = std::move(new_all_possible_unions); + } + } + + if (!all_tuple_fields_have_producer) { + continue; + } + + // Actually build the candidates which union according to all_possible_unions. + for (const auto& possible_union : all_possible_unions) { + if (possible_union.size() > 2) { + CandidatePartition new_candidate = + CandidatePartition::DisjointUnion(*ctxt->dataflow_graph, possible_union); +#if TVM_LOG_DEBUG + std::ostringstream os; + bool first = true; + for (const auto& candidate : possible_union) { + if (first) { + first = false; + } else { + os << ", "; + } + os << candidate->ToString(); + } + VLOG(2) << "Fired rule " << rule_name_ << " on {" << os.str() << "} to yield " + << new_candidate->ToString(); +#endif + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } + } +} + +std::string TupleArgCombinerRuleNode::ToString() const { + return "TupleArgCombinerRule(" + rule_name_ + ")"; +} + +TupleArgCombinerRule::TupleArgCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(TupleProjCombinerRuleNode); + +void TupleProjCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void TupleProjCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running TupleProjCombinerRule(" << rule_name_ << ")"; + // We already explored [0, first_new_index), so don't do it again. 
+ for (size_t i = ctxt->candidate_set->first_new_index(); i < ctxt->candidate_set->size(); ++i) { + CandidatePartition base = ctxt->candidate_set->at(i); + for (PostDfsIndex index : base->sub_graph_->output_) { + auto node = ctxt->dataflow_graph->index_to_node(index); + if (node->ref().as()) { + IndexSet index_set(ctxt->dataflow_graph->size(), {node->index_}); + SubGraph sub_graph(*ctxt->dataflow_graph, std::move(index_set), kInjective, "proj"); + CandidatePartition proj_candidate("", std::move(sub_graph), base->spec_); + CandidatePartition new_candidate = + base.DisjointUnion(*ctxt->dataflow_graph, proj_candidate); + VLOG(2) << "Fired rule " << rule_name_ << " on " << proj_candidate->ToString() << " and " + << base->ToString() << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } +} + +std::string TupleProjCombinerRuleNode::ToString() const { + return "TupleProjCombinerRule(" + rule_name_ + ")"; +} + +TupleProjCombinerRule::TupleProjCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(ConstantCombinerRuleNode); + +void ConstantCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void ConstantCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running ConstantCombinerRule(" << rule_name_ << ")"; + // We already explored [0, first_new_index), so don't do it again. 
+ for (size_t i = ctxt->candidate_set->first_new_index(); i < ctxt->candidate_set->size(); ++i) { + CandidatePartition base = ctxt->candidate_set->at(i); + IndexSet new_constants(ctxt->dataflow_graph->size()); + for (PostDfsIndex index : base->sub_graph_->input_) { + auto node = ctxt->dataflow_graph->index_to_node(index); + if (node->ref().as()) { + new_constants.Add(index); + } + } + if (!new_constants.IsZero()) { + SubGraph sub_graph(*ctxt->dataflow_graph, new_constants, kElemWise, "const"); + CandidatePartition new_const_candidate("", std::move(sub_graph), base->spec_); + CandidatePartition new_candidate = + base.DisjointUnion(*ctxt->dataflow_graph, new_const_candidate); + VLOG(2) << "Fired rule " << rule_name_ << " on " << new_const_candidate->ToString() << " and " + << base->ToString() << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } +} + +std::string ConstantCombinerRuleNode::ToString() const { + return "ConstantCombinerRule(" + rule_name_ + ")"; +} + +ConstantCombinerRule::ConstantCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/combiner_rule.h b/src/relay/collage/combiner_rule.h new file mode 100644 index 0000000000000..bbaa9486d9297 --- /dev/null +++ b/src/relay/collage/combiner_rule.h @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/combiner_rule.h + * \brief Helpers for the \p CombinePartitionRule + */ + +#ifndef TVM_RELAY_COLLAGE_COMBINER_RULE_H_ +#define TVM_RELAY_COLLAGE_COMBINER_RULE_H_ + +#include +#include + +#include + +#include "./candidate_partition.h" +#include "./candidate_set.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Base class for all 'simple' combiner rules. + * + * Given \p upstream and \p downstream candidates which touch, a simple combiner rule returns + * true if their union should also be considered a candidate. + */ +class SimpleCombinerRuleNode : public Object { + public: + String rule_name_; + + void VisitAttrs(AttrVisitor* v); + + virtual bool Fires(const DataflowGraph& dataflow_graph, const CandidatePartition& upstream, + const CandidatePartition& downstream) const; + + virtual std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.SimpleCombinerRule"; + static constexpr const uint32_t _type_child_slots = 1; + TVM_DECLARE_BASE_OBJECT_INFO(SimpleCombinerRuleNode, Object); +}; + +class SimpleCombinerRule : public ObjectRef { + public: + explicit SimpleCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(SimpleCombinerRule, ObjectRef, SimpleCombinerRuleNode); +}; + +/*! + * \brief A simple combiner rule which fires if the \p upstream and \p downstream candidates have + * the given \p upstream_kind and \p downstream_kind (or less) respectively. 
+ */ +class ByKindSimpleCombinerRuleNode : public SimpleCombinerRuleNode { + public: + OpPatternKind upstream_kind_; + OpPatternKind downstream_kind_; + + void VisitAttrs(AttrVisitor* v); + + bool Fires(const DataflowGraph& dataflow_graph, const CandidatePartition& upstream, + const CandidatePartition& downstream) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.ByKindSimpleCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(ByKindSimpleCombinerRuleNode, SimpleCombinerRuleNode); +}; + +class ByKindSimpleCombinerRule : public SimpleCombinerRule { + public: + ByKindSimpleCombinerRule(OpPatternKind upstream_kind, OpPatternKind downstream_kind); + + TVM_DEFINE_OBJECT_REF_METHODS(ByKindSimpleCombinerRule, SimpleCombinerRule, + ByKindSimpleCombinerRuleNode); +}; + +/*! \brief Context required by CombineRuleNode::AppendAllResultsContext. */ +struct AppendAllResultsContext { + AppendAllResultsContext(const DataflowGraph* dataflow_graph, size_t max_max_depth, + CandidateSet* candidate_set) + : dataflow_graph(dataflow_graph), + max_max_depth(max_max_depth), + candidate_set(candidate_set) {} + + const DataflowGraph* dataflow_graph; + size_t max_max_depth; + CandidateSet* candidate_set; +}; + +/*! + * \brief Base class for all 'combiner' rules. + * + * Given the current candidate set, a combiner rule looks for opportunities to form larger + * candidates, optionally removing existing candidates in the process. 
+ */ +class CombinerRuleNode : public Object { + public: + String rule_name_; + + void VisitAttrs(AttrVisitor* v); + + virtual void AppendAllResults(AppendAllResultsContext* ctxt) const; + virtual std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.CombinerRule"; + static constexpr const uint32_t _type_child_slots = 4; + TVM_DECLARE_BASE_OBJECT_INFO(CombinerRuleNode, Object); +}; + +class CombinerRule : public ObjectRef { + public: + explicit CombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(CombinerRule, ObjectRef, CombinerRuleNode); +}; + +/*! + * \brief A combiner rule which runs one or more simple combiner rules over the current + * touching candidates. + */ +class AllSimpleCombinerRuleNode : public CombinerRuleNode { + public: + Array simple_rules_; + + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.AllSimpleCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(AllSimpleCombinerRuleNode, CombinerRuleNode); +}; + +class AllSimpleCombinerRule : public CombinerRule { + public: + AllSimpleCombinerRule(String rule_name, Array simple_rules); + + TVM_DEFINE_OBJECT_REF_METHODS(AllSimpleCombinerRule, CombinerRule, AllSimpleCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines injective sub-groups which appear inside tuples which are + * themselves inputs to injective sub-groups. 
+ */ +class TupleArgCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.TupleArgCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(TupleArgCombinerRuleNode, CombinerRuleNode); +}; + +class TupleArgCombinerRule : public CombinerRule { + public: + explicit TupleArgCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(TupleArgCombinerRule, CombinerRule, TupleArgCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines tuple projection if it's an output of an injective + * group. + */ +class TupleProjCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.TupleProjCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(TupleProjCombinerRuleNode, CombinerRuleNode); +}; + +class TupleProjCombinerRule : public CombinerRule { + public: + explicit TupleProjCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(TupleProjCombinerRule, CombinerRule, TupleProjCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines constants in argument positions to existing candidates. + * Note that scalars are always inlined, so this rule only combines tensor constant arguments. 
+ */ +class ConstantCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.ConstantCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(ConstantCombinerRuleNode, CombinerRuleNode); +}; + +class ConstantCombinerRule : public CombinerRule { + public: + explicit ConstantCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(ConstantCombinerRule, CombinerRule, ConstantCombinerRuleNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COMBINER_RULE_H_ diff --git a/src/relay/collage/cost.cc b/src/relay/collage/cost.cc new file mode 100644 index 0000000000000..ae2eb8600ebd0 --- /dev/null +++ b/src/relay/collage/cost.cc @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.cc + * \brief Represents the estimated cost of a candidate partition. 
+ */ + +#include "./cost.h" + +namespace tvm { +namespace relay { +namespace collage { + +std::string Cost::ToString() const { + if (is_invalid()) { + return "invalid"; + } else if (is_unknown()) { + return "unknown"; + } else if (value_ == 0.0) { + return "0"; + } else { + return std::to_string(value_ * 1e6) + "us"; + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/cost.h b/src/relay/collage/cost.h new file mode 100644 index 0000000000000..8ae276d22078f --- /dev/null +++ b/src/relay/collage/cost.h @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.h + * \brief Represents the estimated cost of a candidate partition. + */ +#ifndef TVM_RELAY_COLLAGE_COST_H_ +#define TVM_RELAY_COLLAGE_COST_H_ + +#include + +#include +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief The assumed cost for a candidate partition. Generally average execution time in seconds. + * However other cost functions are possible, for example to introduce a penalty for high memory + * use, etc. 
+ */ +class Cost { + public: + Cost() = delete; + + static Cost Zero() { return Cost(0.0); } + + /*! + * \brief Returns the distinguished 'invalid' cost signaling a candidate partition is not + * supported by the intended target, for example because the sub-graph has an unsupported operator + * or the intermediate memory required exceeds some system limit. + */ + static Cost Invalid() { return Cost(std::numeric_limits::infinity()); } + + bool is_invalid() const { return std::isinf(value_) && value_ > 0.0; } + + /*! + * \brief Returns the distinguished 'unknown' cost, signaling fixed priorities should be used to + * choose the best partitions. This can be used to disable tuning and fallback to fixed rules, + * much as TVM will use an un-tuned kernel if no tuning records are available. + */ + static Cost Unknown() { return Cost(std::numeric_limits::quiet_NaN()); } + + bool is_unknown() const { return std::isnan(value_); } + + /*! \brief Returns cost with given finite, non-negative value. */ + static Cost Value(double value) { + ICHECK(!std::isnan(value) && !std::isinf(value) && value >= 0.0); + return Cost(value); + } + + bool is_value() const { return !std::isnan(value_) && !std::isinf(value_); } + + /*! \brief Return true if the less-than relation is defined for this and that. */ + bool are_comparable(Cost that) const { return !std::isnan(value_) && !std::isnan(that.value_); } + + /*! \brief Returns sum of this and that. */ + Cost operator+(Cost that) const { return Cost(value_ + that.value_); } + + /*! \brief Returns difference of this and that. */ + Cost operator-(Cost that) const { return Cost(value_ - that.value_); } + + /*! \brief Returns true if this is cheaper than that, assuming they are comparable. */ + bool operator<(Cost that) const { return value_ < that.value_; } + + std::string ToString() const; + + private: + explicit Cost(double value) : value_(value) {} + + /*! + * \brief Non-negative value or: + * - +inf if candidate partition is not feasible. 
+ * - NaN if candidate partition has an unknown cost (priority may be used to break ties). + */ + double value_ = 0.0; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COST_H_ diff --git a/src/relay/collage/cost_estimator.cc b/src/relay/collage/cost_estimator.cc new file mode 100644 index 0000000000000..94a3062f9dc60 --- /dev/null +++ b/src/relay/collage/cost_estimator.cc @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost_estimator.cc + * \brief Interface for measuring candidate partition cost. 
+ */ + +#include "./cost_estimator.h" + +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_OBJECT_TYPE(CostEstimatorNode); +TVM_REGISTER_OBJECT_TYPE(MockEstimatorNode); + +CostEstimator::CostEstimator() { + auto node = make_object(); + data_ = std::move(node); +} + +Cost CostEstimatorNode::Estimate(const IRModule& mod, const Target& target, + bool needs_tvm_tuning) const { + static const runtime::PackedFunc* estimate_seconds = + runtime::Registry::Get("tvm.relay.collage.estimate_seconds"); + ICHECK(estimate_seconds); + const double value = (*estimate_seconds)(mod, target, needs_tvm_tuning); + if (std::isinf(value)) { + return Cost::Invalid(); + } else if (std::isnan(value)) { + return Cost::Unknown(); + } else { + return Cost::Value(value); + } +} + +class MockEstimationVisitor : private ExprVisitor { + public: + MockEstimationVisitor(double op_cost, double fusion_benefit) + : op_cost_(op_cost), fusion_benefit_(fusion_benefit) {} + + double EstimateCost(const Expr& body) { + this->VisitExpr(body); + return cost_; + } + + private: + double op_cost_; + double fusion_benefit_; + int ops_ = 0; + double cost_ = 0.0; + + void VisitExpr_(const CallNode* call) final { + if (call->op->IsInstance()) { + cost_ += op_cost_ * pow(fusion_benefit_, ops_); + ops_++; + } + ExprVisitor::VisitExpr_(call); + } +}; + +Cost MockEstimatorNode::Estimate(const IRModule& mod, const Target& target, + bool needs_tvm_tuning) const { + double op_cost = static_cast(target_costs_.at(target->kind->name)); + double cost = 0; + for (const auto& gv : mod->GetGlobalVars()) { + cost += MockEstimationVisitor(op_cost, /*fusion_benefit=*/0.9).EstimateCost(mod->Lookup(gv)); + } + return Cost::Value(cost); +} + +MockEstimator::MockEstimator(Map target_costs) { + auto node = make_object(); + node->target_costs_ = std::move(target_costs); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("collage.CostEstimator").set_body_typed([]() { return CostEstimator(); }); 
+ +TVM_REGISTER_GLOBAL("collage.MockEstimator").set_body_typed([](Map target_costs) { + return MockEstimator(target_costs); +}); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/cost_estimator.h b/src/relay/collage/cost_estimator.h new file mode 100644 index 0000000000000..145845b4a5587 --- /dev/null +++ b/src/relay/collage/cost_estimator.h @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost_estimator.cc + * \brief Interface for measuring candidate partition cost. + */ + +#ifndef TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ +#define TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ + +#include + +#include "./cost.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief An (abstract) estimator for the cost of executing "main" in an \p IRModule representing + * a candidate partition, using the given target for lowering and codegen. 
+ * + * Generally the implementation will compile to a \p runtime::Module (possibly on a target-specific + * worker if cross-compilation is not available), repeatedly invoke "main" with random data until + * measure variance is acceptable (on a target-specific worker), and return the summarized costs. + * + * If using a TVM native \p Target, it is possible compilation will itself invoke TVM tuning. + * + * TODO(mbs): Actually, currently not abstract so can get some local measurements. + */ +class CostEstimatorNode : public Object { + public: + /*! + * \brief Returns the estimated cost (possibly after many many minutes of training time) of + * running "main" in \p mod using \p target, which represents a possible partitioning of + * some overall Relay expression. + */ + virtual Cost Estimate(const IRModule& mod, const Target& target, bool needs_tvm_tuning) const; + + static constexpr const char* _type_key = "collage.CostEstimator"; + TVM_DECLARE_BASE_OBJECT_INFO(CostEstimatorNode, Object); +}; +class CostEstimator : public ObjectRef { + public: + CostEstimator(); + explicit CostEstimator(::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n) : ObjectRef(n) {} + TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(CostEstimator); + const CostEstimatorNode* operator->() const { + return static_cast(data_.get()); + } + const CostEstimatorNode* get() const { return operator->(); } + using ContainerType = CostEstimatorNode; +}; + +class MockEstimatorNode : public CostEstimatorNode { + public: + Cost Estimate(const IRModule& mod, const Target& target, bool needs_tvm_tuning) const; + + static constexpr const char* _type_key = "collage.MockEstimator"; + TVM_DECLARE_FINAL_OBJECT_INFO(MockEstimatorNode, CostEstimatorNode); + + protected: + friend class MockEstimator; + + Map target_costs_; +}; + +class MockEstimator : public CostEstimator { + public: + explicit MockEstimator(Map target_costs); + + TVM_DEFINE_OBJECT_REF_METHODS(MockEstimator, CostEstimator, MockEstimatorNode); +}; + +} // 
namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ diff --git a/src/relay/collage/dataflow_graph.cc b/src/relay/collage/dataflow_graph.cc new file mode 100644 index 0000000000000..b4e19a73f04d3 --- /dev/null +++ b/src/relay/collage/dataflow_graph.cc @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.cc + * \brief A representation of the dataflow for an overall Relay expression. 
+ */ + +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +DataflowGraph::DataflowGraph(Expr expr) : expr_(std::move(expr)) { + indexed_graph_ = CreateIndexedGraph(expr_); + downstream_map_.reserve(indexed_graph_->size()); + for (PostDfsIndex index = 0; index < indexed_graph_->size(); ++index) { + const Node* node = indexed_graph_->index_to_node(index); + std::unordered_set downstream_nodes; + node->AccumulateDownstreamNodes(&downstream_nodes); + IndexSet index_set(indexed_graph_->size()); + for (const Node* downstream_node : downstream_nodes) { + index_set.Add(downstream_node->index_); + } + downstream_map_.emplace_back(std::move(index_set)); + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/dataflow_graph.h b/src/relay/collage/dataflow_graph.h new file mode 100644 index 0000000000000..a30132ec3d61a --- /dev/null +++ b/src/relay/collage/dataflow_graph.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.h + * \brief A representation of the dataflow for an overall Relay expression. 
+ */ +#ifndef TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ +#define TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ + +#include + +#include +#include + +#include "../ir/indexed_graph.h" +#include "index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Represents the dataflow of an overall Relay expression. + */ +class DataflowGraph { + public: + using Node = IndexedGraph::Node; + + explicit DataflowGraph(Expr expr); + + size_t size() const { return indexed_graph_->size(); } + const Node* index_to_node(PostDfsIndex index) const { + return indexed_graph_->index_to_node(index); + } + const Node* item_to_node(const Expr& expr) const { return indexed_graph_->item_to_node(expr); } + const Node* item_to_node(const ExprNode* expr_node) const { + return indexed_graph_->item_to_node(expr_node); + } + const IndexedGraph& indexed_graph() const { return *indexed_graph_; } + + const IndexSet& downstream_of(PostDfsIndex index) const { + ICHECK_LT(index, indexed_graph_->size()); + return downstream_map_[index]; + } + + private: + /*! \brief The overall expression. */ + Expr expr_; + /*! \brief The indexed graph which captures the main dataflow. */ + std::unique_ptr> indexed_graph_; + /*! \brief Map from a node's PostDfsIndex to the set of it's downstream dataflow node indexes. */ + std::vector downstream_map_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ diff --git a/src/relay/collage/gather_partition_specs.cc b/src/relay/collage/gather_partition_specs.cc new file mode 100644 index 0000000000000..0275541d9fa53 --- /dev/null +++ b/src/relay/collage/gather_partition_specs.cc @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/gather_partition_specs.cc + * \brief Gather the relevant \p PartitionSpecs from the available \p Targets. + */ + +#include "./gather_partition_specs.h" + +#include "utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +PartitionRule MakeCombinePartitionRule(PartitionRule sub_rule, Array combiner_rules, + size_t max_max_depth) { + if (combiner_rules.empty()) { + return sub_rule; + } else { + return CombinePartitionRule("", std::move(sub_rule), std::move(combiner_rules), max_max_depth); + } +} + +/*! \brief Returns the primitive combiner rules which mimic TVM's \p FuseOps. */ +Array TVMCombinerRules() { + Array simple_rules; + // Mimic the FuseOps rules. 
+ simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kBroadcast)); + simple_rules.push_back(ByKindSimpleCombinerRule(kBroadcast, kCommReduce)); + simple_rules.push_back(ByKindSimpleCombinerRule(kInjective, kInjective)); + + Array combiner_rules; + // Fire the simple fusion rules + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + // Fuse tuple arguments + combiner_rules.push_back(TupleArgCombinerRule("tuple")); + // Fuse tuple projection + combiner_rules.push_back(TupleProjCombinerRule("proj")); + + return combiner_rules; +} + +size_t GetMaxMaxDepth(std::string key) { + transform::PassContext ctxt = transform::PassContext::Current(); + std::string config_key = "relay.collage." + key; + Optional opt_max_max_depth = ctxt->GetConfig(config_key, Optional()); + ICHECK(opt_max_max_depth.defined()) + << "missing binding for '" << config_key << "' in pass context"; + ICHECK(opt_max_max_depth.value()->value > 0) + << "invalid value for '" << config_key << "' in pass context"; + return static_cast(opt_max_max_depth.value()); +} + +/*! \brief Returns partition rule mimicking TVM FuseOps. */ +PartitionRule MakeTVMPartitionRule() { + size_t max_max_depth = GetMaxMaxDepth("tvm_max_max_depth"); + // Build singleton candidates for all calls to ops <= kOutEWiseFusable. + OpCallByKindPartitionRule op_call_by_kind(""); + // Combine candidates according to the TVM fusion rules. + PartitionRule combine = + MakeCombinePartitionRule(std::move(op_call_by_kind), TVMCombinerRules(), max_max_depth); + // Discard invalid candidates. + SubGraphConfig sub_graph_config; + sub_graph_config.allow_taps = false; + sub_graph_config.max_max_depth = max_max_depth; + sub_graph_config.max_exits = 1; + return OnlyValidPartitionRule("", std::move(combine), sub_graph_config); + // NOTE: We don't wrap by a "Primitive" since we want to defer making TVM fusion decisions until + // after running more Relay passes. +} + +/*! 
+ * \brief Returns the fusion style for \p compiler. + * + * TODO(mbs): Defer to per-BYOC integration definition. + */ +BYOCStyle BYOCFusionStyleForCompiler(const String& compiler) { + if (compiler == "cutlass" || compiler == "cublas" || compiler == "cudnn") { + return kNoFusionBYOCStyle; + } else if (compiler == "tensorrt") { + return kTVMFusionBYOCStyle; + } else { + return kArbitraryFusionBYOCStyle; + } +} + +/*! + * \brief Returns the primitive combiner rules which allow for any touching candidates + * to be fused provided they don't have kind \p kOpaque. + */ +Array BYOCCombinerRules(const String& compiler) { + Array simple_rules; + Array combiner_rules; + switch (BYOCFusionStyleForCompiler(compiler)) { + case kNoFusionBYOCStyle: + break; + case kTVMFusionBYOCStyle: + // Conservatively assume the BYOC toolchain follows the same rules as for TVM's FuseOps. + simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kBroadcast)); + simple_rules.push_back(ByKindSimpleCombinerRule(kBroadcast, kCommReduce)); + simple_rules.push_back(ByKindSimpleCombinerRule(kInjective, kInjective)); + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + break; + case kArbitraryFusionBYOCStyle: + // Just try all combinations up to the max_max_depth limit. + simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kOutEWiseFusable)); + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + break; + } + return combiner_rules; +} + +/*! + * \brief Returns partition rule mimicking one entry in the patterns list passed to the + * MergeComposite pass. 
+ */ +PartitionRule MakeLabelledDFPatternPartitionRule( + const std::string& compiler, String rule_name, DFPattern dataflow_pattern, + TPatternPredicate predicate = DefaultPatternPredicate) { + DFPatternPartitionRule patterns("", std::move(dataflow_pattern), std::move(predicate)); + return CompositePartitionRule(std::move(rule_name), std::move(patterns)); +} + +/*! + * \brief Returns partition rule mimicking + * MergeComposite/AnnotateTarget/MergeCompilerRegions/PartitionGraph passes for "compiler" + * attribute of \p target. + */ +PartitionRule MakePatternBYOCPartitionRule(const std::string& compiler, + Array sub_rules) { + size_t max_max_depth = GetMaxMaxDepth("byoc_max_max_depth"); + // Union all the individual pattern rules. + UnionPartitionRule unioned("", std::move(sub_rules)); + PartitionRule combine = + MakeCombinePartitionRule(std::move(unioned), BYOCCombinerRules(compiler), max_max_depth); + // Ignore invalid candidates. + SubGraphConfig sub_graph_config; + sub_graph_config.allow_taps = false; + sub_graph_config.max_max_depth = max_max_depth; + sub_graph_config.max_exits = 1; + OnlyValidPartitionRule valid("", std::move(combine), sub_graph_config); + // Wrap the candidates in a "Primitive" function with a "Compiler" attribute. 
+ return PrimitivePartitionRule("", std::move(valid)); +} + +TVM_REGISTER_GLOBAL("relay.collage.make_labelled_dfpattern_partition_rule") + .set_body_typed([](String compiler, String rule_name, DFPattern dataflow_pattern) { + return MakeLabelledDFPatternPartitionRule(std::move(compiler), std::move(rule_name), + std::move(dataflow_pattern)); + }); + +TVM_REGISTER_GLOBAL("relay.collage.make_labelled_dfpattern_partition_rule_with_predicate") + .set_body_typed([](String compiler, String rule_name, DFPattern dataflow_pattern, + TPatternPredicate predicate) { + return MakeLabelledDFPatternPartitionRule(std::move(compiler), std::move(rule_name), + std::move(dataflow_pattern), std::move(predicate)); + }); + +TVM_REGISTER_GLOBAL("relay.collage.make_pattern_byoc_partition_rule") + .set_body_typed(MakePatternBYOCPartitionRule); + +/*! + * \brief Returns the rule to pick out expression nodes which can be 'left behind' for execution + * on the host. + */ +PartitionRule MakeHostPartitionRule() { return HostPartitionRule(""); } + +} // namespace + +Array GatherPartitionSpecs(const CompilationConfig& config) { + Array result; + for (const auto& primitive_target : config->primitive_targets) { + String spec_name = GetSpecName(primitive_target); + PartitionRule rule; + if (primitive_target.IsExternalCodegen()) { + // Transition to the Python side so we can get access to the BYOC pattern registry. + // That will bounce right back into the above construction helpers. 
+ static const runtime::PackedFunc* make_byoc_partition_rule = + runtime::Registry::Get("tvm.relay.collage.make_byoc_partition_rule"); + ICHECK(make_byoc_partition_rule); + rule = (*make_byoc_partition_rule)(spec_name); + VLOG(1) << "Target " << primitive_target->ToDebugString() << " is for BYOC spec_name " + << spec_name << " and has default partition rule:\n" + << rule->ToString(); + } else { + rule = MakeTVMPartitionRule(); + VLOG(1) << "Target " << primitive_target->ToDebugString() << " is for TVM spec_name " + << spec_name << " and has default partition rule:\n" + << rule->ToString(); + } + result.push_back(PartitionSpec(spec_name, primitive_target, rule)); + } + + // Add one more spec to cover the host target. + result.push_back(PartitionSpec(kHostSpecName, config->host_target, MakeHostPartitionRule())); + + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/gather_partition_specs.h b/src/relay/collage/gather_partition_specs.h new file mode 100644 index 0000000000000..62ffca27d635e --- /dev/null +++ b/src/relay/collage/gather_partition_specs.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/collage/gather_partition_specs.h + * \brief Gather the relevant \p PartitionSpecs from the available \p Targets. + */ +#ifndef TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ +#define TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ + +#include + +#include "./partition_spec.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief The 'styles' of BYOC integrations. Used to influence how their corresponding + * partition rule is constructed. + */ +enum BYOCStyle { + /*! + * \brief The BYOC patterns pick out 'ideal' candidates directly, either because: + * - the BYOC toolchain does not perform any fusion so each matched sub-expression maps 1:1 to a + * BYOC-provided operator, or + * - the BYOC toolchain does perform fusion, however the patterns have been written to pick out + * fusable sub-graphs. + */ + kNoFusionBYOCStyle, + + /*! + * \brief The BYOC patterns pick out supported operators, but the BYOC backend may perform + * fusion over those operators in much the same way TVM does. + */ + kTVMFusionBYOCStyle, + + /*! + * \brief The BYOC patterns pick out supported operators, but the BYOC backend may perform + * arbitrary fusion over those operators. + */ + kArbitraryFusionBYOCStyle, +}; + +/*! + * \brief Returns all the partition specifications gathered from the \p Targets in \p config. + */ +Array GatherPartitionSpecs(const CompilationConfig& config); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ diff --git a/src/relay/collage/index_set.cc b/src/relay/collage/index_set.cc new file mode 100644 index 0000000000000..55bec80820a47 --- /dev/null +++ b/src/relay/collage/index_set.cc @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.cc + * \brief Efficient representation of a set of post-dfs indexes. + */ + +#include "./index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +// TODO(mbs): These should operate one-word-at-a-time + +IndexSet::IndexSet(size_t size, const std::vector& indexes) : bitvec_(size, false) { + for (size_t index : indexes) { + ICHECK_LT(index, bitvec_.size()); + ICHECK(!bitvec_[index]); + bitvec_[index] = true; + } +} + +IndexSet IndexSet::operator&(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator|(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] || that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator-(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && !that.bitvec_[index]; + } + return IndexSet(result); +} + 
+bool IndexSet::AreDisjoint(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::IsSubset(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::Intersects(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return true; + } + } + return false; +} + +IndexSet IndexSet::Subst(size_t new_size, const IndexSubst& subst) const { + std::vector result(new_size, false); + for (PostDfsIndex index = 0; index < bitvec_.size(); ++index) { + if (!bitvec_[index]) { + continue; + } + auto itr = subst.find(index); + ICHECK(itr != subst.end()); + PostDfsIndex new_index = itr->second; + ICHECK(new_index < new_size); + ICHECK(!result[new_index]); + result[new_index] = true; + } + return IndexSet(result); +} + +size_t IndexSet::PopCount() const { + size_t n = 0; + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + ++n; + } + } + return n; +} + +bool IndexSet::IsZero() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return false; + } + } + return true; +} + +size_t IndexSet::FirstInsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::LastInsideIndex() const { + for (size_t i = bitvec_.size(); i > 0; i--) { + const size_t index = i - 1; + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::NextIndex(size_t index) const { + 
ICHECK_LT(index, bitvec_.size()); + for (index++; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::FirstOutsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (!bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +bool IndexSet::operator==(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ == that.bitvec_; +} + +bool IndexSet::operator!=(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ != that.bitvec_; +} + +bool IndexSet::operator<(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return true; + } + if (!bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return false; +} + +size_t IndexSet::hash() const { + std::hash> h; + return h(bitvec_); +} + +std::string IndexSet::ToString() const { + std::ostringstream os; + os << "{"; + bool first = true; + for (size_t start = 0; start < bitvec_.size(); /*no-op*/) { + if (!bitvec_[start]) { + ++start; + continue; + } + size_t end; + for (end = start + 1; end < bitvec_.size() && bitvec_[end]; ++end) { + /*no-op*/ + } + if (first) { + first = false; + } else { + os << ","; + } + os << start; + if (end > start + 2) { + os << ".." << (end - 1); + start = end; + } else { + ++start; + } + } + os << "}"; + return os.str(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/index_set.h b/src/relay/collage/index_set.h new file mode 100644 index 0000000000000..f24b695cc76c9 --- /dev/null +++ b/src/relay/collage/index_set.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.h + * \brief Efficient representation of a set of post-dfs indexes. + */ + +#ifndef TVM_RELAY_COLLAGE_INDEX_SET_H_ +#define TVM_RELAY_COLLAGE_INDEX_SET_H_ + +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../ir/indexed_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +using IndexSubst = std::unordered_map; + +class IndexSet { + public: + IndexSet() = default; + explicit IndexSet(size_t size) : bitvec_(size, false) {} + IndexSet(size_t size, const std::vector& indexes); + + IndexSet operator&(const IndexSet& that) const; + IndexSet operator|(const IndexSet& that) const; + IndexSet operator-(const IndexSet& that) const; + bool AreDisjoint(const IndexSet& that) const; + bool IsSubset(const IndexSet& that) const; + bool Intersects(const IndexSet& that) const; + + bool operator[](size_t index) const { + ICHECK_LT(index, bitvec_.size()); + return bitvec_[index]; + } + + IndexSet& Add(size_t index) { + ICHECK_LT(index, bitvec_.size()); + bitvec_[index] = true; + return *this; + } + + IndexSet Subst(size_t new_size, const IndexSubst& subst) const; + + size_t end_index() const { return bitvec_.size(); } + size_t PopCount() const; + bool IsZero() const; + size_t 
FirstInsideIndex() const; + size_t LastInsideIndex() const; + size_t NextIndex(size_t index) const; + size_t FirstOutsideIndex() const; + bool operator==(const IndexSet& that) const; + bool operator!=(const IndexSet& that) const; + bool operator<(const IndexSet& that) const; + size_t hash() const; + std::string ToString() const; + + struct IndexSetIterator { + const IndexSet* set; + size_t i; + + size_t operator*() const { + ICHECK_LT(i, set->end_index()); + return i; + } + + const IndexSetIterator& operator++() { + ICHECK_LT(i, set->end_index()); + i = set->NextIndex(i); + return *this; + } + + bool operator==(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i == that.i; + } + + bool operator!=(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i != that.i; + } + }; + + IndexSetIterator begin() const { return IndexSetIterator{this, FirstInsideIndex()}; } + IndexSetIterator end() const { return IndexSetIterator{this, end_index()}; } + + private: + explicit IndexSet(std::vector bitvec) : bitvec_(std::move(bitvec)) {} + + std::vector bitvec_; +}; + +struct IndexSetEqual { + bool operator()(const IndexSet& left, const IndexSet& right) const { return left == right; } +}; + +struct IndexSetHash { + size_t operator()(const IndexSet& set) const { return set.hash(); } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_INDEX_SET_H_ diff --git a/src/relay/collage/name_supply.cc b/src/relay/collage/name_supply.cc new file mode 100644 index 0000000000000..4b7d497b0d577 --- /dev/null +++ b/src/relay/collage/name_supply.cc @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/name_supply.cc
+ * \brief A source of fresh variable names.
+ */
+
+#include "./name_supply.h"
+
+#include <cctype>
+
+#include <sstream>
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+namespace {
+/*!
+ * \brief Appends a C-identifier-safe rendering of \p str to \p os: characters other
+ * than alphanumerics and '_' are replaced by '_'. \p first tracks, across successive
+ * calls, whether the very first character of the overall name is still to be emitted;
+ * if so and \p str does not begin with a letter or '_', a leading '_' is inserted.
+ */
+void AppendCSafe(bool* first, std::ostringstream& os, const std::string& str) {
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char c = str[i];
+    // Fix: was 'i == 0 && first && ...', which tests the *pointer* (always non-null
+    // at the call sites) rather than the flag it points to, so the leading '_' guard
+    // applied to every hint instead of only the overall first character. Also cast to
+    // unsigned char before the <cctype> calls to avoid UB on negative char values.
+    if (i == 0 && *first && (!std::isalpha(static_cast<unsigned char>(c)) && c != '_')) {
+      os << "_";
+    }
+    if (c == '_' || std::isalnum(static_cast<unsigned char>(c))) {
+      os << c;
+    } else {
+      os << "_";
+    }
+    *first = false;
+  }
+}
+}  // namespace
+
+// Returns a child supply sharing this supply's prefix and a snapshot of its
+// per-basename counters, so names minted by the child cannot collide with
+// names already issued by the parent (at snapshot time).
+NameSupply NameSupply::MakeSubNameSupply() {
+  NameSupply result(prefix_);
+  for (const auto& kv : next_free_index_) {
+    result.next_free_index_.emplace(kv.first, kv.second);
+  }
+  return result;
+}
+
+// Builds a fresh name from the prefix and non-empty hints, joined by '_', then
+// uniquifies it with a per-basename counter ("name", "name_1", "name_2", ...).
+std::string NameSupply::Fresh(const std::initializer_list& hints) {
+  std::ostringstream os;
+  bool first = true;
+  bool need_sep = false;
+  if (!prefix_.empty()) {
+    AppendCSafe(&first, os, prefix_);
+    need_sep = true;
+  }
+  for (const auto& hint : hints) {
+    if (hint.empty()) {
+      continue;
+    }
+    if (need_sep) {
+      os << "_";
+    }
+    AppendCSafe(&first, os, hint);
+    need_sep = true;
+  }
+  std::string name = os.str();
+  auto itr = next_free_index_.find(name);
+  if (itr == next_free_index_.end()) {
+    next_free_index_.emplace(name, 1);
+  } else {
+    os << "_" << itr->second++;
+    name = os.str();
+  }
+  return name;
+}
+
+} // 
namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/name_supply.h b/src/relay/collage/name_supply.h new file mode 100644 index 0000000000000..d37023ab6f815 --- /dev/null +++ b/src/relay/collage/name_supply.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/name_supply.h + * \brief A source of fresh variable names. + */ + +#ifndef TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ +#define TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ + +#include +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! \brief A supply of fresh names. */ +class NameSupply { + public: + explicit NameSupply(std::string prefix) : prefix_(std::move(prefix)) {} + + NameSupply MakeSubNameSupply(); + + void Reserve(const std::string& existing) { next_free_index_.emplace(existing, 1); } + + std::string Fresh(const std::initializer_list& hints); + + private: + /*! \brief Prefix for all names. May be empty. */ + std::string prefix_; + /*! \brief Next unused index for variables with given basename. 
*/ + std::unordered_map next_free_index_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ diff --git a/src/relay/collage/partition_rule.cc b/src/relay/collage/partition_rule.cc new file mode 100644 index 0000000000000..25429aeb5f094 --- /dev/null +++ b/src/relay/collage/partition_rule.cc @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.cc + * \brief Compositional partitioning rules. 
+ */ + +#include "./partition_rule.h" + +#include + +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(PartitionRuleNode); + +void PartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + ICHECK(false) << "PartitionRuleNode::AllCandidates should be overridden in sub-class"; + return {}; +} + +std::string PartitionRuleNode::ToString() const { return ToDoc().str(); } + +Doc PartitionRuleNode::ToDoc() const { + Doc doc; + doc << GetTypeKey() << "(" << Doc::NewLine(2); + std::vector body_items; + AppendBodyItems(&body_items); + doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine(); + doc << ")"; + return doc; +} + +void PartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + body_items->emplace_back(); + body_items->back() << "rule_name=" << Doc::StrLiteral(rule_name_); +} + +PartitionRule::PartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +bool DefaultPatternPredicate(const Expr& matched_sub_expr) { return true; } + +TVM_REGISTER_NODE_TYPE(DFPatternPartitionRuleNode); + +void DFPatternPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector DFPatternPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running DFPatternPartitionRule(" << rule_name_ << ")"; + std::vector result; + DFPatternMatcher matcher(&dataflow_graph.indexed_graph()); + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (!matcher.Match(pattern_, sub_expr)) { + continue; + } + if (!predicate_(sub_expr)) { + VLOG(1) << 
"DFPatternPartitionRule(" << rule_name_ << ") has failing predicate"; + continue; + } + IndexSet inside = MatcherToIndexSet(matcher); + OpPatternKind kind; + String label; + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); + String rule_name = rule_name_.empty() ? sub_graph->label_ : rule_name_; + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "DFPatternPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + VLOG(1) << "DFPatternPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void DFPatternPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "pattern=" << PrettyPrint(pattern_); +} + +DFPatternPartitionRule::DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->pattern_ = std::move(pattern); + node->predicate_ = std::move(predicate); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CompositePartitionRuleNode); + +void CompositePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector CompositePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running CompositePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kComposite, rule_name_); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = 
candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "CompositePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "CompositePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void CompositePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +CompositePartitionRule::CompositePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(PrimitivePartitionRuleNode); + +void PrimitivePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PrimitivePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running PrimitivePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kPrimitive, Integer(1)); + if (spec->target_.IsExternalCodegen()) { + // The spec name will be the target kind name which is 1:1 with the "Compiler" attribute name. 
+ attrs.Set(attr::kCompiler, spec->spec_name_); + } + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "PrimitivePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "PrimitivePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void PrimitivePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +PrimitivePartitionRule::PrimitivePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(UnionPartitionRuleNode); + +void UnionPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector UnionPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector result; + for (const auto& sub_rule : sub_rules_) { + std::vector candidates = sub_rule->AllCandidates(dataflow_graph, spec); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "UnionPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + } + VLOG(1) << "UnionPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + 
+void UnionPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + for (const auto& sub_rule : sub_rules_) { + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule->ToDoc(); + } +} + +UnionPartitionRule::UnionPartitionRule(String rule_name, Array sub_rules) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rules_ = std::move(sub_rules); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(OpCallByKindPartitionRuleNode); + +void OpCallByKindPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector OpCallByKindPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running OpCallByKindPartitionRule(" << rule_name_ << ")"; + std::vector result; + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + auto node = dataflow_graph.index_to_node(index); + Expr sub_expr = node->ref(); + if (sub_expr->IsInstance()) { + OpPatternKind kind; + String label; + std::tie(kind, label) = SubExprKindAndLabel(sub_expr); + if (kind <= kOutEWiseFusable) { + IndexSet inside(dataflow_graph.size(), {index}); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); + String rule_name = NestLabels(rule_name_, sub_graph->label_); + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "OpCallByKindPartitionRule(" << rule_name_ << ") yields " + << candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + } + } + VLOG(1) << "OpCallByKindPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void OpCallByKindPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); +} + +OpCallByKindPartitionRule::OpCallByKindPartitionRule(String rule_name) { + auto node = 
runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CombinePartitionRuleNode); + +void CombinePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector CombinePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + // We'll accumulate all the candidates here, starting with those from the sub-rule. + // Once a candidate is added to this vector it is immutable. + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running CombinePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + CandidateSet result_set(std::move(candidates)); + + size_t num_rounds = 0; + AppendAllResultsContext ctxt(&dataflow_graph, max_max_depth_, &result_set); + while (result_set.PrepareForNextRound()) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << result_set.size() << " candidates (" << result_set.first_new_index() + << " existing)"; + for (const auto& combiner_rule : combiner_rules_) { + combiner_rule->AppendAllResults(&ctxt); + } + } + + std::vector result; + for (auto& candidate : result_set.MovedCurrentCandidates()) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "CombinePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "CombinePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void CombinePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); + for (const auto& combiner_rule : combiner_rules_) { + body_items->emplace_back(); + 
body_items->back() << "combiner_rule=" << combiner_rule->ToString();
+  }
+  body_items->emplace_back();
+  body_items->back() << "max_max_depth=" << max_max_depth_;
+}
+
+// NOTE(review): constructor parameter renamed 'max_max_depth_' -> 'max_max_depth'.
+// The trailing underscore is this file's convention for *data members* (cf. the
+// sibling constructors above); using it on a parameter obscures which identifier
+// is the member in 'node->max_max_depth_ = ...'. Behavior is unchanged.
+CombinePartitionRule::CombinePartitionRule(String rule_name, PartitionRule sub_rule,
+                                           Array combiner_rules,
+                                           size_t max_max_depth) {
+  auto node = runtime::make_object();
+  node->rule_name_ = std::move(rule_name);
+  node->sub_rule_ = std::move(sub_rule);
+  node->combiner_rules_ = std::move(combiner_rules);
+  node->max_max_depth_ = max_max_depth;
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_NODE_TYPE(OnlyValidPartitionRuleNode);
+
+void OnlyValidPartitionRuleNode::VisitAttrs(AttrVisitor* v) {
+  // TODO(mbs)
+}
+
+std::vector OnlyValidPartitionRuleNode::AllCandidates(
+    const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const {
+  std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec);
+  VLOG(1) << "running OnlyValidPartitionRule(" << rule_name_ << ") over " << candidates.size()
+          << " sub-candidates";
+  std::vector result;
+  for (auto& candidate : candidates) {
+    if (!candidate->sub_graph_->IsValid(dataflow_graph, config_)) {
+      VLOG(2) << "Ignoring invalid candidate " << candidate->ToString();
+      continue;
+    }
+    String rule_name = NestLabels(rule_name_, candidate->rule_name_);
+    CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name));
+    VLOG(2) << "OnlyValidPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString();
+    result.emplace_back(std::move(new_candidate));
+  }
+  VLOG(1) << "OnlyValidPartitionRule(" << rule_name_ << ") produced " << result.size()
+          << " candidates";
+  return result;
+}
+
+void OnlyValidPartitionRuleNode::AppendBodyItems(std::vector* body_items) const {
+  PartitionRuleNode::AppendBodyItems(body_items);
+  body_items->emplace_back();
+  body_items->back() << "sub_rule=" << sub_rule_->ToDoc();
+  body_items->emplace_back();
+  body_items->back() << "config=" << config_.ToString();
+}
+ 
// We'll use a zero cost for the candidate since we'll never want to actually estimate the cost
+ CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec, Cost::Zero()); + VLOG(2) << "HostPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.push_back(candidate); + } + VLOG(1) << "HostPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + +void HostPartitionRuleNode::AppendBodyItems(std::vector* body_items) const {} + +HostPartitionRule::HostPartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/partition_rule.h b/src/relay/collage/partition_rule.h new file mode 100644 index 0000000000000..a9209b4235552 --- /dev/null +++ b/src/relay/collage/partition_rule.h @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.h + * \brief Compositional partitioning rules. 
when we need to measure its cost.
And we only + * pay for rewriting the overall Relay expression to commit to a partitioning when the Collage + * search has completed. + * + * The base rules implemented so far: + * - \p DFPatternPartitionRule: Given a \p DFPattern and expression predicate, produces a candidate + * for every sub-graph matched by the pattern and predicate. Unlike the \p PatternRewriter, + * candidates are free to overlap. Used to bring BYOC patterns into the Collage framework. + * - \p OpCallByKindPartitionRule: Uses the "TOpPattern" attribute provided for every Relay + * operator to produce a candidate for every call to a 'fusable Relay operator'. Used to + * look ahead to how TVM will fuse sub-graphs. + * + * The combinator rules implemented so far: + * - \p CompositePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Composite" function. The "Composite" name is taken from the rule name. Used to indicate + * Relay operators (or groups of Relay operators) should be mapped to target-specific operators, + * both for BYOC and TVM external library integrations. + * - \p PrimitivePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Primitive" function, possibly with an additional "Compiler" attribute. Used to + * delineate a partition (or kernel). + * - \p UnionPartitionRule: Simply unions all the candidates from all sub-rules together. Used to + * combine individual \p DFPatternPartitionRules. + * - \p CombinePartitionRule: Given a sub-rule and a list of 'combiner' rules, finds + * all possible ways of combining the sub-rule's candidates to yield even larger candidates. + * Note that the sub-rule's candidates may also be directly included in the results. The + * 'combiner' rules allow combining by \p OpPatternKinds, combining the arguments to tuples + * which themselves are arguments to Relay operator calls, and so on. 
(Though not yet implemented, we'd like to allow a combinator rule which will union candidates + * based on their 'anchor' operators. This can be used to implement 'vertical' and 'horizontal' + * partition on more primitive candidates. Note that the \p SubGraph machinery supports + * multiple-input and -output sub-graphs and their validation, so horizontal partition is easy to + * implement.)
CompositePartitionRule(label1) + * DFPatternPartitionRule(pattern1) + * : + * CompositePartitionRule(labeln) + * DFPatternPartitionRule(patternn)
+ * \brief Returns all the possible candidate partitions according to this rule for the overall + * expression corresponding to \p dataflow_graph. The candidates will generally have unknown + * target and cost: the target will be filled in by the \p PartitionSpec, while the cost will + * be filled in lazily. + */ + virtual std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const; + + std::string ToString() const; + Doc ToDoc() const; + + protected: + virtual void AppendBodyItems(std::vector* body_items) const; + + public: + static constexpr const char* _type_key = "relay.collage.PartitionRule"; + static constexpr const uint32_t _type_child_slots = 10; + TVM_DECLARE_BASE_OBJECT_INFO(PartitionRuleNode, Object); +}; + +class PartitionRule : public ObjectRef { + public: + explicit PartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(PartitionRule, ObjectRef, PartitionRuleNode); +}; + +/*! + * \brief Partition rule which fires on all sub-expressions matching a dataflow-pattern and pattern + * predicate. It is valid for matching candidates to overlap. + */ +class DFPatternPartitionRuleNode : public PartitionRuleNode { + public: + /*! + * \brief Relay pattern. + */ + DFPattern pattern_; + + /*! + * \brief Predicate on matched sub-expression to decide if partition rule should fire. 
+ */ + TPatternPredicate predicate_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.DFPatternPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(DFPatternPartitionRuleNode, PartitionRuleNode); +}; + +class DFPatternPartitionRule : public PartitionRule { + public: + DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate = DefaultPatternPredicate); + + TVM_DEFINE_OBJECT_REF_METHODS(DFPatternPartitionRule, PartitionRule, DFPatternPartitionRuleNode); +}; + +/*! + * \brief Partition rule which wraps candidates within a function with the "Composite" attribute + * bound to the given rule name. + * + * This is the standard way by which operators or operator groups are tagged as being supported + * by a particular externally provided function. It is up to the BYOC lowering function to + * recognize the "Composite" name and emit the appropriate code or call. + */ +class CompositePartitionRuleNode : public PartitionRuleNode { + public: + /*! \brief The sub-partition rule. */ + PartitionRule sub_rule_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.CompositePartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(CompositePartitionRuleNode, PartitionRuleNode); +}; + +class CompositePartitionRule : public PartitionRule { + public: + CompositePartitionRule(String rule_name, PartitionRule sub_rule); + + TVM_DEFINE_OBJECT_REF_METHODS(CompositePartitionRule, PartitionRule, CompositePartitionRuleNode); +}; + +/*! 
This is the standard way by which sub-graphs are marked as being in a 'partition' whose + * compilation will be managed by an external BYOC toolchain.
/*! + * \brief Partition rule which places calls to Relay operators
And when we write 'kl -> kr' we mean it to match a sub-expression of kind kr or less whose + * dataflow inputs are all of kind kl or less.
Partition rule which keeps only candidates from the sub-rule whose sub-groups are valid
Relay constructs, such as let bindings,
+ */ +class HostPartitionRuleNode : public PartitionRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + public: + static constexpr const char* _type_key = "relay.collage.HostPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(HostPartitionRuleNode, PartitionRuleNode); +}; + +class HostPartitionRule : public PartitionRule { + public: + explicit HostPartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(HostPartitionRule, PartitionRule, HostPartitionRuleNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PARTITION_RULE_H_ diff --git a/src/relay/collage/partition_spec.cc b/src/relay/collage/partition_spec.cc new file mode 100644 index 0000000000000..60c2e6b6d9764 --- /dev/null +++ b/src/relay/collage/partition_spec.cc @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_spec.cc + * \brief Combine a \p PartitionRule with one or more \p Targets. 
+ */ + +#include "./partition_spec.h" + +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +String DefaultValidateSubGraphFunc(const Function& function) { return String(); } + +TVM_REGISTER_NODE_TYPE(PartitionSpecNode); + +void PartitionSpecNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PartitionSpecNode::AllCandidates( + const DataflowGraph& dataflow_graph) const { + std::vector result; + // Make sure the target is in scope for inspection by any predicates in + // DFPatternPartitionRuleNode rules. + With target_scope(target_); + // Gather all the candidates. + std::vector candidates = + rule_->AllCandidates(dataflow_graph, GetRef(this)); + // Update the rules names. + for (const auto& candidate : candidates) { + ICHECK_EQ(candidate->spec_, GetRef(this)); + String rule_name = NestLabels(spec_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(candidate, std::move(rule_name)); + result.emplace_back(std::move(new_candidate)); + } + return result; +} + +std::string PartitionSpecNode::ToString() const { + Doc doc; + doc << "PartitionSpec(" << Doc::NewLine(2); + std::vector body_items; + body_items.emplace_back(); + body_items.back() << "spec_name=" << Doc::StrLiteral(spec_name_); + body_items.emplace_back(); + body_items.back() << "target=" << target_->ToDebugString(); + body_items.emplace_back(); + body_items.back() << "rule=" << rule_->ToDoc(); + doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine(); + doc << ")"; + return doc.str(); +} + +PartitionSpec::PartitionSpec(String spec_name, Target target, PartitionRule rule, + TValidateSubGraphFunc validate_sub_graph_func) { + auto node = runtime::make_object(); + node->spec_name_ = std::move(spec_name); + node->target_ = std::move(target); + node->rule_ = std::move(rule); + node->validate_sub_graph_func_ = std::move(validate_sub_graph_func); + data_ = std::move(node); +} + +} // namespace collage +} // namespace 
relay +} // namespace tvm diff --git a/src/relay/collage/partition_spec.h b/src/relay/collage/partition_spec.h new file mode 100644 index 0000000000000..90a7b6d65a65f --- /dev/null +++ b/src/relay/collage/partition_spec.h @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_spec.h + * \brief Combine a \p PartitionRule with one or more \p Targets. + */ + +#ifndef TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ +#define TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ + +#include +#include +#include + +#include +#include + +#include "./partition_rule.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Type of functions for checking the validity of partitions before they proceed to lowering + * and codegen. The argument is the function extracted from the overall expression to represent + * the partition. The result is a non-empty error message string if the candidate should be + * rejected. + */ +using TValidateSubGraphFunc = TypedPackedFunc; + +/*! + * \brief The default validation function. Always returns the empty string, ie no error. + */ +String DefaultValidateSubGraphFunc(const Function& function); + +/*! 
It's tempting to support multiple targets here. Eg the partitioning rules for
+ */ + std::vector AllCandidates(const DataflowGraph& dataflow_graph) const; + + std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.PartitionSpec"; + TVM_DECLARE_FINAL_OBJECT_INFO(PartitionSpecNode, Object); +}; + +class PartitionSpec : public ObjectRef { + public: + PartitionSpec(String spec_name, Target target, PartitionRule rule, + TValidateSubGraphFunc validate_sub_graph_func = DefaultValidateSubGraphFunc); + + TVM_DEFINE_OBJECT_REF_METHODS(PartitionSpec, ObjectRef, PartitionSpecNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ diff --git a/src/relay/collage/priority_queue.h b/src/relay/collage/priority_queue.h new file mode 100644 index 0000000000000..1d30fe5d96af3 --- /dev/null +++ b/src/relay/collage/priority_queue.h @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/priority_queue.h + * \brief An updatable priority queue. + */ + +#ifndef TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ +#define TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ + +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! 
\brief Priority queue of search states, ordered by increasing cost. */ +template +class PriorityQueue { + public: + PriorityQueue() = default; + + /*! \brief Pushes \p item onto the queue. */ + void Push(T* item) { set_.emplace(item); } + + /*! \brief Pops the item with the least cost off the queue. */ + T* Pop() { + ICHECK(!set_.empty()); + T* item = *set_.begin(); + set_.erase(set_.begin()); + return item; + } + + /*! \brief Updates the queue to account for \p item's best cost being lowered. */ + void Update(T* item) { + auto itr = std::find_if(set_.begin(), set_.end(), + [item](const T* that) { return EqTPtr()(that, item); }); + ICHECK(itr != set_.end()); + set_.erase(itr); + set_.emplace(item); + } + + bool empty() const { return set_.empty(); } + size_t size() const { return set_.size(); } + + private: + // TODO(mbs): Actually use a pri-queue datastructure! + std::set set_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ diff --git a/src/relay/collage/prune_candidates.cc b/src/relay/collage/prune_candidates.cc new file mode 100644 index 0000000000000..f6a53b75f4b24 --- /dev/null +++ b/src/relay/collage/prune_candidates.cc @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/prune_candidates.cc + * \brief Try to remove candidates which will never contribute to an optimal partitioning. + */ + +#include "./prune_candidates.h" + +#include "./dataflow_graph.h" +#include "./gather_partition_specs.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +/*! + * \brief Returns a map from post-dfs dataflow node indices to the indices within \p candidates for + * those candidates which intersect that dataflow node. + * + * NOTE: The index set in the vector results is over candidate indices not post-dfs indices! + */ +std::vector MakeInsideMap(const DataflowGraph& dataflow_graph, + const std::vector& candidates) { + std::vector result(dataflow_graph.size(), IndexSet(candidates.size())); + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition candidate = candidates[i]; + for (PostDfsIndex index : candidate->sub_graph_->inside_) { + result[index].Add(i); + } + } + return result; +} + +/*! + * \brief Returns the maximal candidates within \p candidates. A candidate is maximal if it is not + * contained by any super-candidate for the same target. + */ +std::vector MaximalCandidates( + const DataflowGraph& dataflow_graph, const std::vector& candidates) { + std::vector inside_map = MakeInsideMap(dataflow_graph, candidates); + std::vector result; + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition maximal_candidate = candidates[i]; + bool has_super_candidate = false; + IndexSet explored_candidates(candidates.size()); // over candidates! + for (PostDfsIndex index : maximal_candidate->sub_graph_->inside_) { + for (size_t j : inside_map[index]) { + if (i == j) { + // Ignore self. + continue; + } + if (explored_candidates[j]) { + // Already checked. 
+ continue; + } + explored_candidates.Add(j); + CandidatePartition super_candidate = candidates[j]; + if (maximal_candidate->spec_ == super_candidate->spec_ && + maximal_candidate->sub_graph_->inside_.IsSubset(super_candidate->sub_graph_->inside_)) { + has_super_candidate = true; + break; + } + } + if (has_super_candidate) { + break; + } + } + if (!has_super_candidate) { + VLOG(2) << "Found maximal candidate " << maximal_candidate->ToString(); + result.emplace_back(maximal_candidate); + } + } + VLOG(1) << "Have " << result.size() << " maximal candidates"; + return result; +} + +/*! + * \brief Returns all the candidates in \p candidates which intersect without being equal. + */ +std::vector IntersectingCandidates( + const DataflowGraph& dataflow_graph, const std::vector& candidates) { + std::vector inside_map = MakeInsideMap(dataflow_graph, candidates); + IndexSet intersecting(candidates.size()); // over candidates! + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition intersecting_candidate = candidates[i]; + IndexSet explored_candidates(candidates.size()); // over candidates! + for (PostDfsIndex index : intersecting_candidate->sub_graph_->inside_) { + for (size_t j : inside_map[index]) { + if (j < i) { + // Intersection is commutative. + continue; + } + if (i == j) { + // Ignore self. + continue; + } + if (explored_candidates[j]) { + // Already checked. + continue; + } + explored_candidates.Add(j); + CandidatePartition other_candidate = candidates[j]; + if (intersecting_candidate->sub_graph_->inside_ == other_candidate->sub_graph_->inside_) { + // Have same inside set. 
+ continue; + } + VLOG(2) << "Candidate " << intersecting_candidate->ToString() << " intersects with " + << other_candidate->ToString(); + intersecting.Add(i); + intersecting.Add(j); + } + } + } + std::vector result; + for (size_t i : intersecting) { + CandidatePartition candidate = candidates[i]; + VLOG(2) << "Found intersecting candidate " << candidate->ToString(); + result.emplace_back(candidate); + } + VLOG(1) << "Have " << result.size() << " intersecting candidates"; + return result; +} + +/*! + * \brief Returns the set operation left - right. + */ +std::vector SetDifference(const std::vector& left, + const std::vector& right) { + std::unordered_set + right_set(right.begin(), right.end()); + std::vector result; + for (const auto& candidate : left) { + if (right_set.count(candidate) == 0) { + result.emplace_back(candidate); + } + } + return result; +} + +/*! + * \brief Adds everything in right to left. Returns the number of elements added. + */ +size_t SetUnionInPlace( + std::unordered_set* left, + const std::vector& right) { + size_t init_size = left->size(); + for (const auto& candidate : right) { + left->emplace(candidate); + } + return left->size() - init_size; +} + +} // namespace + +std::vector PruneCandidates( + const DataflowGraph& dataflow_graph, + const std::vector& initial_candidates) { + VLOG_CONTEXT << "prune"; + // Start with all candidates available. + std::vector candidates = initial_candidates; + std::unordered_set pruned; + size_t num_rounds = 0; + while (true) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << candidates.size() << " candidates"; + // Add all the maximal candidates to the pruned set. + std::vector maximal_candidates = + MaximalCandidates(dataflow_graph, candidates); + size_t num_new_pruned = SetUnionInPlace(&pruned, maximal_candidates); + VLOG(1) << "Added " << num_new_pruned << " new pruned candidates"; + if (num_new_pruned == 0) { + // We've reached a fixed point. 
+ break; + } + // If two pruned candidates intersect without being equal then we may miss valid + // paths during search. So remove those intersecting candidates from the available candidates + // and try again so as to find smaller candidates to 'bridge the gaps'. + std::vector pruned_vec(pruned.begin(), pruned.end()); + std::vector intersecting_candidates = + IntersectingCandidates(dataflow_graph, pruned_vec); + // We need more maximal candidates to fill in the gaps between the current pruned candidates. + // Force that by removing the intersecting candidates from the set of available candidates + // and going around again. + candidates = SetDifference(candidates, intersecting_candidates); + } + + VLOG(1) << "Have " << pruned.size() << " pruned candidates"; + std::vector result(pruned.begin(), pruned.end()); + // Re-establish a canonical order of candidates. + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/prune_candidates.h b/src/relay/collage/prune_candidates.h new file mode 100644 index 0000000000000..6e35870b9b97f --- /dev/null +++ b/src/relay/collage/prune_candidates.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/prune_candidates.h + * \brief Try to remove candidates which will never contribute to an optimal partitioning. + */ + +#ifndef TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ +#define TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ + +#include + +#include "./candidate_partition.h" +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Returns \p initial_candidates with all unnecessary candidates pruned. + * + * We prune according to the following two heuristics: + * 1. Given partitions (A, target) and (B, target) then + * cost(A union B, target) < cost(A, target) + cost(B, target). + * That is, there's no use estimating the cost of small partitions when a larger partition + * containing them is also available. More precisely, call a partition 'maximal' if it is + * not contained by any other partition for the same target. Then we want to prefer maximal + * candidates when searching. + * 2. Given maximal partitions (A union B, target) and (A union B, target') where + * target != target', then min(cost(A union B, target), cost(A union B, target')) < + * min(cost(A, target) + cost(B, target'), cost(A, target') + cost(B, target)). + * That is, there's no use estimating cross-combinations of partitions which are not maximal. + * + * However, we can't prune a non-maximal candidate if it will make some other maximal candidate + * unreachable during the Collage search. We achieve this by iterating until fixed point: + * - Find maximal candidates of current set of candidates. + * - Add those maximal candidates to the output 'pruned' set. + * - If any two candidates in the 'pruned' set intersect without being equal, remove those from + * the current set of candidates and go around again. That will force more candidates to + * be considered 'maximal'. 
+ * That over-approximates the true necessary candidates but is at least simple. + */ +std::vector PruneCandidates( + const DataflowGraph& dataflow_graph, const std::vector& initial_candidates); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ diff --git a/src/relay/collage/recover_virtual_device_map.cc b/src/relay/collage/recover_virtual_device_map.cc new file mode 100644 index 0000000000000..47265b85c8a21 --- /dev/null +++ b/src/relay/collage/recover_virtual_device_map.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/recover_virtual_device_map.cc + * \brief Recover the virtual device for every Relay expression node. 
+ */ + +#include "./recover_virtual_device_map.h" + +#include "../transforms/device_aware_visitors.h" + +namespace tvm { +namespace relay { +namespace collage { + +std::unordered_map RecoverVirtualDeviceMap(const IRModule& mod, + const Expr& expr) { + class Visitor : public transform::DeviceAwareExprVisitor { + public: + explicit Visitor(const Optional& maybe_mod) + : transform::DeviceAwareExprVisitor(maybe_mod) {} + + void VisitExpr(const Expr& expr) final { + map_[expr.get()] = GetVirtualDevice(expr); + transform::DeviceAwareExprVisitor::VisitExpr(expr); + } + + std::unordered_map map_; + }; + + Visitor visitor(mod); + visitor.VisitExpr(expr); + return std::move(visitor.map_); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/recover_virtual_device_map.h b/src/relay/collage/recover_virtual_device_map.h new file mode 100644 index 0000000000000..e3104b457e458 --- /dev/null +++ b/src/relay/collage/recover_virtual_device_map.h @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/recover_virtual_device_map.h + * \brief Recover the virtual device for every Relay expression node. 
+ * + * Temporary hack until virtual_device_ work is finished. + */ +#ifndef TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ +#define TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ + +#include + +#include + +namespace tvm { +namespace relay { +namespace collage { + +std::unordered_map RecoverVirtualDeviceMap(const IRModule& mod, + const Expr& expr); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ diff --git a/src/relay/collage/sub_graph.cc b/src/relay/collage/sub_graph.cc new file mode 100644 index 0000000000000..016ce958ee5ba --- /dev/null +++ b/src/relay/collage/sub_graph.cc @@ -0,0 +1,1005 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/sub_graph.cc + * \brief Represents a sub-graph of an overall Relay expression. + */ + +#include "./sub_graph.h" + +#include + +#include "../../support/scalars.h" +#include "../transforms/pass_utils.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +class Extractor; + +/*! + * \brief Helper class for rewriting expressions to replace a sub-graph according to the + * given extractor. 
+ */ +class Rewriter : public ExprMutator { + public: + explicit Rewriter(const Extractor* extractor) : extractor_(extractor) {} + + Expr VisitExpr(const Expr& expr) final; + + private: + /*! \brief Already prepared extractor which will guide the rewrite. */ + const Extractor* extractor_; +}; + +/*! \brief Helper class for extracting matched sub-graphs from the overall expression. */ +class Extractor : public ExprMutator { + public: + Extractor(const DataflowGraph* dataflow_graph, const SubGraphNode* sub_graph, + FunctionAttrsMap opt_attrs) + : dataflow_graph_(dataflow_graph), sub_graph_(sub_graph), opt_attrs_(std::move(opt_attrs)) { + ICHECK_EQ(dataflow_graph_->size(), sub_graph_->overall_size()); + } + + const DataflowGraph& dataflow_graph() const { return *dataflow_graph_; } + + /*! + * \brief Collect the parameters and output expressions for the function representing + * the sub-graph. + */ + void Extract() { + ICHECK(!sub_graph_->IsEmpty()); + VLOG(2) << "Extracting " << sub_graph_->ToString(); + const bool for_function = opt_attrs_.defined(); + + // In reverse dataflow order... + for (PostDfsIndex i = dataflow_graph_->size(); i > 0; --i) { + PostDfsIndex index = i - 1; + if (!sub_graph_->inside_[index]) { + // Node is outside sub-graph. + continue; + } + VLOG(2) << "index " << index; + auto node = dataflow_graph_->index_to_node(index); + if (sub_graph_->exit_[node->index_] || node->is_external_ || memo_.count(node->ref()) == 0) { + // This sub-expression is: + // - inside the sub-graph and needed outside the sub-graph. So it must contribute to an + // output (even if we've already visited it while constructing an output from a + // downstream sub-expression). + // - not yet visited, in which case it must still be considered an 'output' so it will + // be evaluated for any possible side effects. 
+ Expr output = VisitExpr(GetRef(node->node_ref_)); + VLOG(2) << "index " << index << " added as output:\n" + << PrettyPrint(output) << "\nat " << outputs_.size(); + expr_to_output_index_.emplace(node->node_ref_, outputs_.size()); + outputs_.emplace_back(std::move(output)); + output_types_.emplace_back(node->node_ref_->checked_type()); + } + } + ICHECK(!outputs_.empty()); + + // Reverse the outputs so as to preserve the original evaluation order. + std::reverse(outputs_.begin(), outputs_.end()); + std::reverse(output_types_.begin(), output_types_.end()); + for (auto& kv : expr_to_output_index_) { + kv.second = static_cast(outputs_.size()) - 1 - kv.second; + } + + // Build a 'body' expression to represent the extracted sub-graph. If we have multiple + // outputs we'll place them in a tuple. + Type body_type; + Expr body; + if (outputs_.size() > 1) { + body_type = TupleType(output_types_); + body = Tuple(outputs_); + body->checked_type_ = body_type; + } else { + body_type = output_types_.front(); + body = outputs_.front(); + } + + // Re-express all the sub-sub-graphs in terms of the body. + DataflowGraph body_dataflow_graph(body); + std::vector sub_sub_graphs; + IndexSubst subst = MakeIndexSubst(body_dataflow_graph); + for (const auto& sub_sub_graph : sub_graph_->sub_sub_graphs_) { + sub_sub_graphs.emplace_back(sub_sub_graph.Subst(body_dataflow_graph, subst)); + } + + // Sweep backwards through the body, rewriting to account for each sub-sub-graph. + body = SubSubGraph::ParallelRewrite(body_dataflow_graph, body, std::move(sub_sub_graphs)); + + if (for_function) { + // Rewrite so all input nodes are now conveyed via call arguments to a new function. 
+ Array arg_types; + arg_types.reserve(params_.size()); + for (const auto& param : params_) { + arg_types.push_back(param->checked_type()); + } + extracted_ = Function(std::move(params_), std::move(body), body_type, + /*ty_params=*/{}, DictAttrs(opt_attrs_)); + extracted_->checked_type_ = + FuncType(std::move(arg_types), body_type, /*type_params=*/{}, /*type_constraints=*/{}); + body = Call(extracted_, std::move(args_)); + body->checked_type_ = body_type; + } else { + // Don't do anything with the inputs. + extracted_ = body; + } + + // Setup the output substitution. + for (const auto& kv : expr_to_output_index_) { + Expr expr; + if (outputs_.size() == 1) { + expr = body; + } else if (for_function) { + expr = TupleGetItem(body, kv.second); + expr->checked_type_ = output_types_[kv.second]; + } else { + const auto* tuple_node = body.as(); + ICHECK(tuple_node); + expr = tuple_node->fields[kv.second]; + } + VLOG(2) << "output " << dataflow_graph_->item_to_node(kv.first)->index_ << " is at index " + << kv.second << " (of " << outputs_.size() << " outputs)"; + output_substitution_.emplace(kv.first, std::move(expr)); + } + } + + ////// Following members are valid only after Extract() has returned. + + /*! + * \brief Returns the expression representing the extracted sub-graph. If opt_attrs_ is + * defined then will be a function. + */ + Expr extracted() const { return extracted_; } + + /*! + * \brief Returns the substitution to apply to all expression nodes in the overall expression + * so as to replace references to outputs of the sub-graph with their rewritten form. + */ + const std::unordered_map& output_substitution() const { + return output_substitution_; + } + + private: + /*! + * \brief Returns a map from original index to new index for each node inside the sub-graph. Only + * valid after \p Extract has made its backwards dataflow sweep. 
+ */ + IndexSubst MakeIndexSubst(const DataflowGraph& new_dataflow_graph) const { + VLOG(2) << "building extractor substitution"; + IndexSubst subst; + for (PostDfsIndex index : sub_graph_->inside_) { + auto orig_node = dataflow_graph_->index_to_node(index); + ICHECK_EQ(orig_node->index_, index); + auto itr = memo_.find(orig_node->ref()); + ICHECK(itr != memo_.end()); + auto new_node = new_dataflow_graph.item_to_node(itr->second); + VLOG(2) << orig_node->index_ << " |-> " << new_node->index_; + subst.emplace(orig_node->index_, new_node->index_); + } + return subst; + } + + /*! \brief Returns true if \p expr is inside the sub-graph. */ + bool inside(const Expr& expr) { + return sub_graph_->inside_[dataflow_graph_->item_to_node(expr)->index_]; + } + + /*! + * \brief Returns the variable uniquely representing \p expr, which should be + * an input node (ie outside the sub-graph but feeding into a node inside the sub-graph). + * + * It is valid for: + * - An expression outside the sub-graph to be used multiple times inside the sub-graph. + * - An expression outside the sub-graph to be used both inside and outside the sub-graph. + */ + Var VarFor(const Expr& expr) { + ICHECK(!inside(expr)); + ICHECK(opt_attrs_.defined()); + auto itr = expr_to_param_.find(expr.get()); + if (itr != expr_to_param_.end()) { + return itr->second; + } + auto fresh_var = Var("FunctionVar_" + std::to_string(params_.size()), expr->checked_type()); + fresh_var->checked_type_ = expr->checked_type(); + params_.push_back(fresh_var); + args_.push_back(expr); + expr_to_param_.emplace(expr.get(), fresh_var); + return fresh_var; + } + + /*! + * \brief If \p expr is inside the sub-graph then return it's rewritten form. + * If \p expr is outside the sub-graph then it must correspond to an input node. + * - If opt_attrs_ is defined return the variable to represent it. + * - Otherwise just return the expression directly. + * + * Should be called only on inputs to nodes which are inside the sub-graph. 
+ */ + Expr VisitExpr(const Expr& expr) final { + if (inside(expr)) { + return ExprMutator::VisitExpr(expr); + } else if (CanInline(expr)) { + // Implicitly include inlinable input sub-expressions. + return expr; + } else if (opt_attrs_.defined()) { + // Map to a function parameter. + return VarFor(expr); + } else { + // Stop rewriting. + return expr; + } + } + + Expr VisitExpr_(const FunctionNode* function_node) override { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + return GetRef(function_node); + } + return ExprMutator::VisitExpr_(function_node); + } + + //// Context fields, passed in constructor. + + /*! \brief The dataflow graph corresponding to the overall expression. */ + const DataflowGraph* dataflow_graph_; + /*! \brief The sub-graph of the above we are extracting. */ + const SubGraphNode* sub_graph_; + /*! \brief Optional attributes if the sub-graph should be extracted as a function. */ + FunctionAttrsMap opt_attrs_; + + //// Result fields, available after Extract() called. + + /*! + * \brief The extracted expression. If opt_attrs_ is defined this will be a function. + */ + Expr extracted_; + /*! + * \brief Map from output nodes to corresponding expressions. If the sub-graph has more than + * one exit node then each entry will be a tuple projection. + */ + std::unordered_map output_substitution_; + + //// Accumulator fields, built as we visit expressions. + + /*! \brief (If opt_attrs_ is defined) Parameters representing input expression nodes. */ + Array params_; + /*! + * \brief (If opt_attrs_ is defined) The input expression nodes for each of the above params_. + */ + Array args_; + /*! + * \brief (If opt_attrs_ is defined) Map from existing input expression nodes to the parameters + * in params_ which now representing them. + */ + std::unordered_map expr_to_param_; + /*! + * \brief Accumulated new expressions which represent the exit nodes of the rewritten sub-graph. + * It is possible to have multiple outputs. 
It is possible one output also contributes to other + * outputs (ie the output is a 'tap'). + */ + std::vector outputs_; + /*! \brief (If opt_attrs_ is defined) Types of original expressions corresponding to outputs_. */ + std::vector output_types_; + /*! + * \brief Map from existing exit expression nodes to the index in outputs_ which should + * represent them in the rewritten overall expression. + */ + std::unordered_map expr_to_output_index_; +}; + +Expr Rewriter::VisitExpr(const Expr& expr) { + auto itr = extractor_->output_substitution().find(expr.get()); + if (itr == extractor_->output_substitution().end()) { + return ExprMutator::VisitExpr(expr); + } else { + return itr->second; + } +} + +} // namespace + +std::pair SubExprKindAndLabel(const Expr& sub_expr) { + class Visitor : public ExprFunctor(const Expr&)> { + private: + std::pair VisitExpr_(const CallNode* call_node) final { + if (const auto* op_node = call_node->op.as()) { + auto op = GetRef(op_node); + static auto fpattern = Op::GetAttrMap("TOpPattern"); + if (fpattern.count(op) == 0) { + VLOG(1) << "no TOpPattern known for " << op->name << ", considering opaque"; + return {kOpaque, op->name}; + } else if (IsDynamic(call_node->checked_type()) && IsDataDependent(call_node)) { + VLOG(1) << "call has dynamic shape which is data-dependent, considering opaque"; + return {kOpaque, op->name}; + } else { + OpPatternKind kind = static_cast(fpattern[op]); + VLOG(2) << "TOpPattern for " << op->name << " is " << KindToString(kind); + return {kind, op->name}; + } + } else if (const auto* function_node = call_node->op.as()) { + Optional opt_i = + function_node->GetAttr("TOpPattern", Optional()); + if (opt_i.defined()) { + OpPatternKind kind = static_cast(opt_i.value()->value); + VLOG(1) << "TOpPattern for function is " << KindToString(kind); + return {kind, "call_prim"}; + } else { + VLOG(1) << "calling function without TOpPattern, considering opaque"; + return {kOpaque, "call_fun"}; + } + } else { + VLOG(1) << 
"unsupported call, considering opaque"; + return {kOpaque, "call_any"}; + } + } + + std::pair VisitExpr_(const ConstantNode* constant_node) final { + VLOG(2) << "TOpPattern for constant is " << KindToString(kElemWise); + if (support::IsSimpleScalar(constant_node)) { + return {kElemWise, "scalar"}; + } else { + return {kElemWise, "const"}; + } + } + + std::pair VisitExpr_(const TupleNode* tuple_node) final { + const auto* tuple_type_node = tuple_node->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple is " << KindToString(kInjective); + return {kInjective, "tuple"}; + } else { + VLOG(1) << "tuple contains non-tensors, considering opaque"; + return {kOpaque, "tuple"}; + } + } + + std::pair VisitExpr_( + const TupleGetItemNode* tuple_get_item_node) final { + const auto* tuple_type_node = tuple_get_item_node->tuple->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple projection is " << KindToString(kInjective); + return {kInjective, "proj"}; + } else { + VLOG(1) << "tuple being projected contains non-tensors, considering opaque"; + return {kOpaque, "proj"}; + } + } + + // TODO(mbs): We implement the following mostly so we have a lightweight way of describing + // the current sub-expression. If partitioning is ever extended beyond the usual call/tuple/proj + // sub-language we should revise the returned operator kinds to match. 
+ + std::pair VisitExpr_(const VarNode* var_node) final { + return {kOpaque, "%" + var_node->name_hint()}; + } + std::pair VisitExpr_(const GlobalVarNode* global_var_node) final { + return {kOpaque, "@" + global_var_node->name_hint}; + } + std::pair VisitExpr_(const OpNode* op_node) final { + return {kOpaque, "`" + op_node->name}; + } + std::pair VisitExpr_(const FunctionNode* function_node) final { + return {kOpaque, "fn"}; + } + std::pair VisitExpr_(const LetNode* let_node) final { + return {kOpaque, "let"}; + } + std::pair VisitExpr_(const IfNode* if_node) final { + return {kOpaque, "if"}; + } + std::pair VisitExpr_(const RefCreateNode* ref_create_node) final { + return {kOpaque, "ref"}; + } + std::pair VisitExpr_(const RefReadNode* op) final { + return {kOpaque, "ref_read"}; + } + std::pair VisitExpr_(const RefWriteNode* op) final { + return {kOpaque, "ref_write"}; + } + std::pair VisitExpr_(const ConstructorNode* op) final { + return {kOpaque, "`" + op->name_hint}; + } + std::pair VisitExpr_(const MatchNode* op) final { + return {kOpaque, "match"}; + } + }; + return Visitor().VisitExpr(sub_expr); +} + +std::pair SubGraphKindAndLabel(const DataflowGraph& dataflow_graph, + const IndexSet& inside) { + std::ostringstream os; + bool first = true; + OpPatternKind max_kind = kElemWise; + for (PostDfsIndex index : inside) { + OpPatternKind sub_kind; + std::string sub_label; + std::tie(sub_kind, sub_label) = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref()); + if (!sub_label.empty()) { + if (first) { + first = false; + } else { + os << "+"; + } + os << sub_label; + } + max_kind = CombineKinds(max_kind, sub_kind); + } + return {max_kind, os.str()}; +} + +IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher) { + IndexSet result(matcher.size()); + for (const auto& kv : matcher.memo()) { + for (const auto& matched_sub_expr : kv.second) { + if (CanInline(matched_sub_expr)) { + // Trivial sub-expressions can just be included in the extracted function body 
+ // when we construct it and don't need to be considered part of the sub-graph. + continue; + } + if (kv.first.as()) { + // Don't consider the expressions matched by a wildcard to be part of the sub-graph. + continue; + } + result.Add(matcher.expr_to_node(matched_sub_expr)->index_); + } + } + return result; +} + +std::string SubGraphConfig::ToString() const { + std::ostringstream os; + os << "{max_exits=" << max_exits; + os << ",allow_taps=" << allow_taps; + os << ",max_max_depth=" << max_max_depth; + os << "}"; + return os.str(); +} + +TVM_REGISTER_NODE_TYPE(SubSubGraphNode); + +void SubSubGraphNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +SubGraph SubSubGraphNode::sub_graph() const { return Downcast(sub_graph_obj_); } + +bool SubSubGraphNode::operator==(const SubSubGraphNode& that) const { + return *sub_graph().get() == *that.sub_graph().get(); +} + +bool SubSubGraphNode::operator<(const SubSubGraphNode& that) const { + return *sub_graph().get() < *that.sub_graph().get(); +} + +size_t SubSubGraphNode::hash() const { + size_t h = StructuralHash()(attrs_); + h ^= sub_graph()->hash() + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; +} + +std::string SubSubGraphNode::ToString() const { + std::ostringstream os; + os << "{sub_graph=" << sub_graph()->ToString(); + os << ",attrs=" << PrettyPrint(attrs_); + os << "}"; + return os.str(); +} + +Function SubSubGraphNode::Extract(const DataflowGraph& dataflow_graph) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + return Downcast(extractor.extracted()); +} + +Expr SubSubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + Rewriter rewriter(&extractor); + return rewriter.VisitExpr(expr); +} + +SubSubGraph::SubSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs) { + auto data = runtime::make_object(); + data->sub_graph_obj_ = std::move(sub_graph); + 
data->attrs_ = std::move(attrs); + data_ = std::move(data); +} + +SubSubGraph SubSubGraph::Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const { + return SubSubGraph(get()->sub_graph().Subst(new_dataflow_graph, subst), get()->attrs_); +} + +bool SubSubGraph::TriviallyUnionable(const SubSubGraph& that) const { + if (get()->attrs_.size() != that->attrs_.size()) { + return false; + } + for (const auto& kv : get()->attrs_) { + if (kv.first == "Composite") { + // Even if all the attributes agree we don't consider "Composite" functions to + // ever be unionable. + // TODO(mbs): Find a cleaner way to do this. + return false; + } + auto itr = that->attrs_.find(kv.first); + if (itr == that->attrs_.end()) { + return false; + } + if (!StructuralEqual()(kv.second, (*itr).second)) { + return false; + } + } + return true; +} + +SubSubGraph SubSubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, + const SubSubGraph& that) const { + ICHECK(TriviallyUnionable(that)); + return SubSubGraph(get()->sub_graph().DisjointUnion(dataflow_graph, that->sub_graph()), + get()->attrs_); +} + +/*static*/ +Expr SubSubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_sub_graphs) { + // IMPORTANT: See the corresponding comment in SubGraph::ParallelRewrite. 
+ std::sort(sub_sub_graphs.begin(), sub_sub_graphs.end(), + [](const SubSubGraph& left, const SubSubGraph& right) { + return left->sub_graph()->last_inside_index_ > right->sub_graph()->last_inside_index_; + }); + + Expr result = expr; + for (const auto& sub_sub_graph : sub_sub_graphs) { + result = sub_sub_graph->Rewrite(dataflow_graph, result); + } + return result; +} + +TVM_REGISTER_NODE_TYPE(SubGraphNode); + +void SubGraphNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +IndexSet SubGraphNode::Downstream(const DataflowGraph& dataflow_graph) const { + IndexSet downstream(dataflow_graph.size()); + for (PostDfsIndex exit_index : exit_) { + downstream = downstream | dataflow_graph.downstream_of(exit_index); + } + return downstream; +} + +bool SubGraphNode::IsValid(const DataflowGraph& dataflow_graph, + const SubGraphConfig& config) const { + // Check we don't have too many exit nodes. + if (config.max_exits > 0 && exit_.PopCount() > config.max_exits) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: " << exit_.PopCount() + << " exits exceeds maximum " << config.max_exits; + return false; + } + + // Check the maximum path depth is in limit. + if (config.max_max_depth > 0 && max_depth_ > config.max_max_depth) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: maximum depth " << max_depth_ + << " exceeds limit " << config.max_max_depth; + return false; + } + + // All inside nodes must be in the same basic block. + const DataflowGraph::Node* basic_block = nullptr; + for (PostDfsIndex index : inside_) { + auto node = dataflow_graph.index_to_node(index); + if (basic_block == nullptr) { + basic_block = node->basic_block_; + } + if (node->basic_block_ != basic_block) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: nodes are from different basic blocks"; + return false; + } + } + + // The sub-sub-graphs must be subsets and non-overlapping. 
+ IndexSet union_inside(dataflow_graph.size()); + for (const auto& sub_sub_graph : sub_sub_graphs_) { + if (!sub_sub_graph->sub_graph()->inside_.AreDisjoint(union_inside)) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: sub-sub-graphs overlap"; + return false; + } + if (!sub_sub_graph->sub_graph()->inside_.IsSubset(inside_)) { + VLOG(1) << "Subgraph " << ToString() + << " is invalid: sub-sub-graph is not subset of overall sub-graph"; + return false; + } + } + + if (!config.allow_taps) { + // Exit nodes cannot also contribute to inside nodes. + for (PostDfsIndex index : exit_) { + auto node = dataflow_graph.index_to_node(index); + if (AnyOutputInside(node)) { + VLOG(1) << "Subgraph " << ToString() + << " is invalid: inner node is 'tapped' and also contributes to output, but taps " + "are disabled"; + return false; + } + } + } + + // Check no output would end up feeding into any entry node. + for (PostDfsIndex output_index : output_) { + if (dataflow_graph.downstream_of(output_index).Intersects(entry_)) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: output node " << output_index + << " feeds back into this sub-graph"; + return false; + } + } + + // Looks legit! + return true; +} + +Function SubGraphNode::ExtractAsFunction(const DataflowGraph& dataflow_graph) const { + SubSubGraph sub_sub_graph(GetRef(this), FunctionAttrsMap()); + return sub_sub_graph->Extract(dataflow_graph); +} + +Expr SubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const { + if (sub_sub_graphs_.empty()) { + // Nothing to rewrite. 
+ return expr; + } + Extractor extractor(&dataflow_graph, this, NullValue()); + extractor.Extract(); + Rewriter rewriter(&extractor); + return rewriter.VisitExpr(expr); +} + +std::string SubGraphNode::ToString() const { + std::ostringstream os; + os << "{inside=" << inside_.ToString(); + os << ",entry=" << entry_.ToString(); + os << ",exit=" << exit_.ToString(); + os << ",input=" << input_.ToString(); + os << ",output=" << output_.ToString(); + os << ",max_depth=" << max_depth_; + os << ",kind=" << KindToString(kind_); + if (!label_.empty()) { + os << ",label=" << label_; + } + for (const auto& sub_sub_graph : sub_sub_graphs_) { + os << ",sub_sub_graph=" << sub_sub_graph->ToString(); + } + os << "}"; + return os.str(); +} + +bool SubGraphNode::operator==(const SubGraphNode& that) const { + ICHECK_EQ(inside_.end_index(), that.inside_.end_index()); + if (inside_ != that.inside_) { + return false; + } + if (sub_sub_graphs_.size() != that.sub_sub_graphs_.size()) { + return false; + } + for (size_t i = 0; i < sub_sub_graphs_.size(); ++i) { + if (*sub_sub_graphs_[i].get() != *that.sub_sub_graphs_[i].get()) { + return false; + } + } + return true; +} + +bool SubGraphNode::operator<(const SubGraphNode& that) const { + if (first_inside_index_ < that.first_inside_index_) { + return true; + } + if (that.first_inside_index_ < first_inside_index_) { + return false; + } + return inside_ < that.inside_; +} + +size_t SubGraphNode::hash() const { + size_t h = inside_.hash(); + for (const auto& sub_sub_graph : sub_sub_graphs_) { + h ^= sub_sub_graph->hash() + 0x9e3779b9 + (h << 6) + (h >> 2); + } + return h; +} + +void SubGraphNode::Init(const DataflowGraph& dataflow_graph) { + for (PostDfsIndex index = 0; index < inside_.end_index(); ++index) { + auto node = dataflow_graph.index_to_node(index); + if (inside_[index]) { + if (AnyInputOutside(node)) { + entry_.Add(index); + } + if (AnyOutputOutside(node) || node->is_external_) { + exit_.Add(index); + } + } else { + if 
(AnyInputInside(node)) { + output_.Add(index); + } + if (AnyOutputInside(node) && !CanInline(node->ref())) { + input_.Add(index); + } + } + } + max_depth_ = MaxDepth(dataflow_graph); +} + +size_t SubGraphNode::MaxDepth(const DataflowGraph& dataflow_graph) const { + std::unordered_map max_depths; + std::vector stack; + size_t max_depth = 0; + // All the entry nodes have max depth 0. + for (PostDfsIndex index : entry_) { + auto node = dataflow_graph.index_to_node(index); + max_depths.emplace(node, 0); + stack.push_back(node); + } + while (!stack.empty()) { + const DataflowGraph::Node* node = stack.back(); + stack.pop_back(); + size_t next_depth = max_depths[node] + 1; + if (exit_[node->index_]) { + // If this node is external then it will have no outputs but we still wish to consider + // the path to the implied output as requiring one more step. + // Otherwise we're accounting for reaching one of the external outputs below. + max_depth = std::max(max_depth, next_depth); + } + for (const DataflowGraph::Node* output_node : node->outputs_) { + if (!inside_[output_node->index_]) { + continue; + } + if (max_depths.count(output_node) == 0) { + max_depths.emplace(output_node, next_depth); + stack.push_back(output_node); + } else if (next_depth > max_depths[output_node]) { + // We found a deeper path to an already expanded node. We'll expand again. + max_depths[output_node] = next_depth; + stack.push_back(output_node); + } + } + } + return max_depth; +} + +/*! \brief Returns true if any (input/output) of node is (outside/inside) the sub-graph. 
*/ +bool SubGraphNode::AnyInputOutside(const DataflowGraph::Node* node) const { + return std::any_of(node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { + return !inside_[sub_node->index_] && !CanInline(sub_node->ref()); + }); +} + +bool SubGraphNode::AnyInputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputOutside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return !inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind, + String label, std::vector sub_sub_graphs) { + std::sort( + sub_sub_graphs.begin(), sub_sub_graphs.end(), + [](const SubSubGraph& left, const SubSubGraph& right) { return *left.get() < *right.get(); }); + auto node = runtime::make_object(); + node->inside_ = std::move(inside); + node->first_inside_index_ = node->inside_.FirstInsideIndex(); + node->last_inside_index_ = node->inside_.LastInsideIndex(); + node->entry_ = IndexSet(node->inside_.end_index()); + node->exit_ = IndexSet(node->inside_.end_index()); + node->input_ = IndexSet(node->inside_.end_index()); + node->output_ = IndexSet(node->inside_.end_index()); + node->kind_ = kind; + node->label_ = std::move(label); + node->sub_sub_graphs_ = sub_sub_graphs; + node->Init(dataflow_graph); + data_ = std::move(node); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph) + : SubGraph(dataflow_graph, IndexSet(dataflow_graph.size())) {} + +bool 
SubGraph::AreDisjoint(const SubGraph& that) const { + return get()->inside_.AreDisjoint(that->inside_); +} + +namespace { +/*! \brief Returns true if an output of \p left not in \p right ultimately flows into \p right. */ +bool FlowsInto(const DataflowGraph& dataflow_graph, const SubGraph& left, const SubGraph& right) { + for (PostDfsIndex output_index : left->output_) { + if (!right->inside_[output_index] && + dataflow_graph.downstream_of(output_index).Intersects(right->entry_)) { + return true; + } + } + return false; +} +} // namespace + +bool SubGraph::AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + if (!get()->inside_.AreDisjoint(that->inside_)) { + // Easy rejection. + return false; + } + if (!get()->output_.Intersects(that->entry_)) { + // Not touching. + return false; + } + if (FlowsInto(dataflow_graph, *this, that) || FlowsInto(dataflow_graph, that, *this)) { + // Unioning would create a cycle. + return false; + } + return true; +} + +bool SubGraph::AreSelfContained(const SubGraph& that) const { + return get()->output_.IsSubset(that->entry_) && that->input_.IsSubset(get()->exit_); +} + +SubGraph SubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + ICHECK(AreDisjoint(that)); + IndexSet inside = get()->inside_ | that->inside_; + std::vector sub_sub_graphs; + for (const auto& sub_sub_graph : get()->sub_sub_graphs_) { + sub_sub_graphs.push_back(sub_sub_graph); + } + for (const auto& sub_sub_graph : that->sub_sub_graphs_) { + auto existing_itr = std::find_if(sub_sub_graphs.begin(), sub_sub_graphs.end(), + [&sub_sub_graph](const SubSubGraph& existing) { + return existing.TriviallyUnionable(sub_sub_graph); + }); + if (existing_itr != sub_sub_graphs.end()) { + *existing_itr = existing_itr->DisjointUnion(dataflow_graph, sub_sub_graph); + } else { + sub_sub_graphs.push_back(sub_sub_graph); + } + } + return SubGraph(dataflow_graph, std::move(inside), CombineKinds(get()->kind_, that->kind_), + 
UnionLabels(get()->label_, that->label_), std::move(sub_sub_graphs)); +} + +SubGraph SubGraph::WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const { + std::vector sub_sub_graphs; + sub_sub_graphs.push_back(SubSubGraph(*this, attrs)); + return SubGraph(dataflow_graph, get()->inside_, get()->kind_, get()->label_, + std::move(sub_sub_graphs)); +} + +SubGraph SubGraph::Subst(const DataflowGraph& new_dataflow_graph, const IndexSubst& subst) const { + IndexSet new_inside = get()->inside_.Subst(new_dataflow_graph.size(), subst); + std::vector new_sub_sub_graphs; + for (const auto& sub_sub_graph : get()->sub_sub_graphs_) { + new_sub_sub_graphs.push_back(sub_sub_graph.Subst(new_dataflow_graph, subst)); + } + return SubGraph(new_dataflow_graph, std::move(new_inside), get()->kind_, get()->label_, + std::move(new_sub_sub_graphs)); +} + +/*static*/ +Expr SubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_graphs) { + // IMPORTANT: + // - All the sub-graphs will be w.r.t. the dataflow graph for the original expression. + // Each time we call Rewrite on one of those graphs the result expression will be rewritten + // from the final output back to the inputs. The inputs will then be shared with the original + // expression. Thus it is safe to iteratively rewrite all the sub-graphs without redoing the + // dataflow_graph and substituting indexes provided we work in reverse dataflow order. + // - We rely on the argument expression reference holding the original expression alive so that + // the dataflow_graph will never contain dangling pointers (even though as per above we'll + // never dereference them). 
+ std::sort(sub_graphs.begin(), sub_graphs.end(), [](const SubGraph& left, const SubGraph& right) { + return left->last_inside_index_ > right->last_inside_index_; + }); + Expr result = expr; + for (const auto& sub_graph : sub_graphs) { + result = sub_graph->Rewrite(dataflow_graph, result); + } + return result; +} + +transform::Pass PartitionOnIndexesForTesting(size_t max_exits, bool allow_taps, + Array indexes, Array labels) { + auto pass_func = [=](Function function, IRModule mod, transform::PassContext ctxt) { + ICHECK(!labels.defined() || indexes.size() == labels.size()); + VLOG(1) << "Considering partitioning for:\n" << PrettyPrint(function); + DataflowGraph dataflow_graph(function); + std::unordered_map> sub_sub_graph_indexes; + std::vector node_indexes; + node_indexes.reserve(indexes.size()); + for (size_t i = 0; i < indexes.size(); ++i) { + const Integer& index = indexes[i]; + ICHECK_GE(index->value, 0); + ICHECK_LT(index->value, dataflow_graph.size()); + PostDfsIndex index_int = static_cast(index->value); + node_indexes.push_back(index_int); + if (labels.defined()) { + const String& label = labels[i]; + if (!label.empty()) { + sub_sub_graph_indexes[label].push_back(index_int); + } + } + } + std::vector sub_sub_graphs; + for (const auto& kv : sub_sub_graph_indexes) { + FunctionAttrsMap attrs; + attrs.Set("Composite", kv.first); + sub_sub_graphs.push_back( + SubSubGraph(SubGraph(dataflow_graph, IndexSet(dataflow_graph.size(), kv.second)), attrs)); + } + OpPatternKind kind; + String label; + IndexSet inside(dataflow_graph.size(), node_indexes); + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label), + std::move(sub_sub_graphs)); + SubGraphConfig config; + config.max_exits = max_exits; + config.allow_taps = allow_taps; + if (sub_graph->IsValid(dataflow_graph, config)) { + VLOG(1) << "Sub-graph " << sub_graph->ToString() << " is considered valid"; + } else { + VLOG(1) 
<< "Sub-graph " << sub_graph->ToString() + << " is NOT considered valid, not partitioning"; + return function; + } + Function result = Downcast(sub_graph->Rewrite(dataflow_graph, function)); + VLOG(1) << "Partitioned to:\n" << PrettyPrint(result); + return result; + }; + return transform::CreateFunctionPass(pass_func, /*opt_level=*/0, "PartitionOnIndexesForTesting", + {}); +} + +TVM_REGISTER_GLOBAL("relay.collage.partition_on_indexes_for_testing") + .set_body_typed([](size_t max_outputs, bool allow_taps, Array indexes, + Array labels) { + return PartitionOnIndexesForTesting(max_outputs, allow_taps, indexes, labels); + }); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/sub_graph.h b/src/relay/collage/sub_graph.h new file mode 100644 index 0000000000000..021bc73a8a26b --- /dev/null +++ b/src/relay/collage/sub_graph.h @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/sub_graph.h + * \brief Represents a sub-graph of an overall Relay expression. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_SUB_GRAPH_H_ +#define TVM_RELAY_COLLAGE_SUB_GRAPH_H_ + +#include + +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../ir/indexed_graph.h" +#include "./index_set.h" +#include "dataflow_graph.h" +#include "name_supply.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! \brief Returns operator pattern kind as single-letter string. */ +std::string KindToString(OpPatternKind kind); + +/*! + * \brief Returns a kind and label for the single \p sub_expr, ignoring it's sub-sub expressions. + */ +std::pair SubExprKindAndLabel(const Expr& sub_expr); + +/*! + * \brief Returns a kind and label for all the nodes in \p inside. + */ +std::pair SubGraphKindAndLabel(const DataflowGraph& dataflow_graph, + const IndexSet& inside); + +/*! + * \brief Returns the index set representing all the sub-expression matched by \p matcher. + */ +IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher); + +/*! + * \brief Configuration controlling which sub-graphs are considered valid. + */ +struct SubGraphConfig { + /*! \brief Maximum number of exit nodes in the sub-graph, or zero if no limit. */ + size_t max_exits = 0; + /*! + * \brief Whether a node inside the sub-graph may flow to nodes both inside and outside + * the sub-graph (which we call a 'tap'). Note that it is still possible to have multiple outputs + * even with this flag false. + */ + bool allow_taps = false; + /*! + * \brief Maximum allowed maximum depth, or zero if no-limit. + */ + size_t max_max_depth = 0; + + std::string ToString() const; +}; + +class SubGraph; +using FunctionAttrsMap = Map; + +/*! + * \brief A sub-sub graph is a sub-graph which is to be nested inside a function as part of some + * enclosing sub-graph. + * + * Extraction yields a function with input nodes replaced by parameters and exit nodes in the + * function result. 
Rewriting replaces the sub-graph with a call to that function, and all + * outputs with (projections from) the call result. + * + * (Note that it's tempting to move attrs_ into \p SubGraphNode and thus avoid this class. + * However we found the implementation was easier to understand in this form since it makes + * the result of \p Extract unambiguous.) + */ +class SubSubGraphNode : public Object { + public: + /*! \brief The nested sub-graph. */ + ObjectRef /* actually SubGraph */ sub_graph_obj_; + /*! \brief Attributes (possibly empty) to attach to the extracted function. */ + FunctionAttrsMap attrs_; + + void VisitAttrs(AttrVisitor* v); + + SubGraph sub_graph() const; + + bool operator==(const SubSubGraphNode& that) const; + bool operator!=(const SubSubGraphNode& that) const { return !(*this == that); } + bool operator<(const SubSubGraphNode& that) const; + size_t hash() const; + + std::string ToString() const; + + /*! + * \brief Returns the function representing this sub-sub-graph within the overall expression + * represented by \p dataflow_graph: + * - All sub-graph inputs become parameters. + * - All sub-graph outputs become function results (either directly or as a field in a tuple). + * - The function has attrs_ for attributes (which may be empty). + * - The function body accounts for any rewrites implied by the nested sub-graph. + */ + Function Extract(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns \p expr (which has matching \p dataflow_graph) rewritten to encode the + * partitioning implied by this sub-sub-graph. + */ + Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const; + + static constexpr const char* _type_key = "relay.collage.SubSubGraph"; + TVM_DECLARE_FINAL_OBJECT_INFO(SubSubGraphNode, Object); +}; + +class SubSubGraph : public ObjectRef { + public: + SubSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs); + + /*! 
+ * \brief Returns copy of this sub-sub-graph with all indexes substituted according to \p subst, + * whose range is w.r.t. \p new_dataflow_graph. + */ + SubSubGraph Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const; + + /*! + * \brief Returns true if this can be safely unioned. + */ + bool TriviallyUnionable(const SubSubGraph& that) const; + + /*! + * \brief Returns the disjoin union of this and \p that sub-sub graphs, which must agree on + * their attributes. + */ + SubSubGraph DisjointUnion(const DataflowGraph& dataflow_graph, const SubSubGraph& that) const; + + /*! + * \brief Returns \p expr rewritten according to all the given sub-sub-graphs. The sub-sub-graphs + * can be given in any order, but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_sub_graphs); + + TVM_DEFINE_OBJECT_REF_METHODS(SubSubGraph, ObjectRef, SubSubGraphNode); +}; + +using SubSubGraphs = Array; + +/*! + * \brief A compact representation of a sub-graph within an (implied) overall Relay expression. + * + * Sub-graphs can be used to represent partitions/kernels/composite functions without having to + * pay the cost of constructing or rewriting any expressions. We also allow 'extracting' a + * function to use for measuring a partition/kernel's latency independently from 'rewriting' + * the overall Relay expression since only a tiny subset of candidate partitions will end up being + * needed after Collage has completed its search. + * + * We expect O(thousands) of sub-graphs to be in flight while processing a given model, so are + * mindful of space overhead. + * + * A sub-graph classifies every dataflow node of the overall expression as either 'inside' or + * 'outside' the sub-graph. Obviously not all such divisions make sense, for example it is not + * valid for an inside node to feed into another inside node via outside nodes. 
We provide the + * \p IsValid method to check for validity, and \p SubGraphConfig to control which validity rules + * apply (such as maximum depth). + * + * We generally work with the \p DataflowGraph representation of the overall Relay expression + * rather than the expression itself. We use the post-dfs visit index to uniquely refer to + * expression nodes. + * + * As well as 'inside' and 'outside' we have four other flavors of dataflow nodes, all uniquely + * determined from the 'inside' nodes: + * - 'entry' nodes are those inside with at least one dataflow input outside. + * - 'exit' nodes are those inside with at least one dataflow output outside, or which + * are considered 'external' in the underlying dataflow graph (eg because they represent + * the result of the overall function). + * - 'input' nodes are those outside with at least one dataflow output inside. + * - 'output' nodes are those outside with at least one dataflow input inside. + * Index sets for these are cached with the sub-graph for performance. + * + * It is valid to have multiple entry nodes (we can bind a parameter for each). It may be valid to + * have multiple exit nodes (we can build a tuple of all such). It may be valid to have exit nodes + * which also contribute to other inside nodes (ie represent a 'tap' on an intermediate result). + * + * Sub-graphs are closed under: + * - Disjoint union. + * - Wrapping by a function with given attributes (see \p SubSubGraph above). This can be used + * to encode "Composite" functions, or to represent a candidate kernel within a "Primitive" + * function. (By combining 'wrapping' with 'union' we can encode, eg, 'this sub-graph should + * be placed inside a primitive function which itself may have calls to composite functions). + * - Substitution, which allows a sub-graph w.r.t. one dataflow graph to be transformed to + * match some other (typically smaller) dataflow graph. 
+ * + * See the subclasses of \p PartitionRule for how sub-graphs are built and combined during Collage + * search. + * + * To support some of the \p OpPatternKind-based fusion rule processing we give sub-graphs + * a kind, which is generally the maximum of the kinds of all the operator calls appearing + * inside it. We also given sub-graphs a (not necessarily unique) label to help debugging + * and guide the selection of global symbol names. + */ +class SubGraphNode : public Object { + public: + /*! + * \brief Which sub-expressions are inside the sub-graph (using their post-dfs indexes w.r.t. + * the implied DataflowGraph). + */ + IndexSet inside_; + + /*! + * \brief Index of first and last inside nodes. + * + * Cached for performance, uniquely determined by inside_. + */ + PostDfsIndex first_inside_index_ = 0; + PostDfsIndex last_inside_index_ = 0; + + /*! + * \brief Which sub-expressions are entry/exit/input/output for this sub-graph. + * + * Cached for performance, uniquely determined by inside_. + */ + IndexSet entry_; + IndexSet exit_; + IndexSet input_; + IndexSet output_; + + /*! + * \brief Maximum depth of any dataflow path from an entry to an output sub-expression. + * + * Cached for performance, uniquely determined by inside_. + */ + size_t max_depth_ = 0; + + /*! + * \brief The \p OpPatternKind summarizing the input/output behavior of the sub-graph. + * + * A sub-graph consisting of a single Relay expression node is given kind: + * - For Call to a Relay operator, the "TOpPattern" attribute of that operator (provided the + * call does not involve data-dependent dynamic shapes). + * - For Call to Relay Function, the "TOpPattern" attribute of the function (provided it has + * that attribute) + * - For Constants, \p kElemWise. + * - For Tuple and tuple projections, \p kInjective (provided all tuple fields are of tensor + * type) + * - All other nodes \p kOpaque. + * Sub-graphs with more than one node have the maximum of the kind of each node. 
+ * + * Cached for performance, uniquely determined by inside_. + */ + OpPatternKind kind_ = kOpaque; + + /*! + * \brief A label for the sub-graph. Not guaranteed to be unique, but is a human-readable summary + * of the sub-graph which can help with debugging and guide the selection of global symbol names. + */ + String label_; + + /*! + * \brief Sub-sub-graphs of this sub-graph which must be represented by functions. These must + * be disjoint, but it's ok for this sub-graph to have nodes not inside any sub-sub-graph. + */ + SubSubGraphs sub_sub_graphs_; + + void VisitAttrs(AttrVisitor* v); + + // TODO(mbs): 'Anchor nodes' and rules for unioning them. + // In FuseOps it's just the unique kEWiseFusable node, if any. + // I'd like to allow writing vertical fusion rules, eg if two candidates are directly + // connected and have nn.conv2d anchors allow their join. + // I'd also like to allow horizontal fusion rules, eg if two candidates are not directly + // connected but could be joined without producing invalid (eg cyclic) and have nn.conv2d anchors + // then do so. Come back to this. + + /*! \brief Number of nodes in overall dataflow graph. */ + size_t overall_size() const { return inside_.end_index(); } + + bool IsEmpty() const { return inside_.IsZero(); } + + /*! \brief Number of nodes in sub-graph. */ + size_t Size() const { return inside_.PopCount(); } + + /*! + * \brief Returns the dataflow nodes downstream of all exit nodes. + */ + IndexSet Downstream(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns true if this sub-graph is valid. Ie: + * - no output of the sub-graph can flow to any input of the sub-graph (otherwise we'd end up + * with a dataflow cycle when we partition). + * - all inputs and outputs of the sub-graph are in the same scope, ie not separated by + * control flow (otherwise there'd be no consistent program point at which to eval the + * partitioned function). + * - no more than config.max_exits exits are required. 
+ * - if config.allow_taps is false, no inside node has outputs to nodes both inside and + * outside the sub-graph. + */ + bool IsValid(const DataflowGraph& dataflow_graph, const SubGraphConfig& config) const; + + /*! + * \brief Returns this sub-graph extracted as a stand-alone function. The function will have + * no attributes, and is suitable for building and profiling by the \p CostEstimator. + */ + Function ExtractAsFunction(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns \p expr (which has matching \p dataflow_graph) rewritten to encode the + * partitioning implied by this sub-graph. + */ + Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const; + + std::string ToString() const; + + bool operator==(const SubGraphNode& that) const; + bool operator!=(const SubGraphNode& that) const { return !(*this == that); } + bool operator<(const SubGraphNode& that) const; + size_t hash() const; + + private: + /*! \brief Initialize the entry/exit/input/output sets given the inside and \p dataflow_graph. */ + void Init(const DataflowGraph& dataflow_graph); + + /*! \brief Calculates and returns the maximum path depth. */ + size_t MaxDepth(const DataflowGraph& dataflow_graph) const; + + /*! \brief Return's true if any (input/output) of node is (outside/inside) the sub-graph. */ + bool AnyInputOutside(const DataflowGraph::Node* node) const; + bool AnyInputInside(const DataflowGraph::Node* node) const; + bool AnyOutputOutside(const DataflowGraph::Node* node) const; + bool AnyOutputInside(const DataflowGraph::Node* node) const; + + public: + static constexpr const char* _type_key = "relay.collage.SubGraph"; + TVM_DECLARE_FINAL_OBJECT_INFO(SubGraphNode, Object); + + friend class SubGraph; +}; + +class SubGraph : public ObjectRef { + public: + /*! \brief Primitive constructor. The following constructors are generally more convenient. 
*/ + SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind = kOpaque, + String label = {}, std::vector sub_sub_graphs = {}); + + /*! \brief Constructs the empty sub-graph for \p dataflow_graph. */ + explicit SubGraph(const DataflowGraph& dataflow_graph); + + /*! \brief Returns true if this and that are disjoint. */ + bool AreDisjoint(const SubGraph& that) const; + + /*! + * \brief Returns true if: + * - \p this and \p that are disjoint, and + * - an output node of \p this coincides with an entry node of \p that, and + * - \p this and \p that are not obviously invalid after \p DisjointUnion + * (eg because such a sub-graph would produce a cycle). + * Note however that the \p DisjointUnion may not necessarily be valid even with the above + * checks. + */ + bool AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const; + + /*! + * \brief Returns true if: + * - all the outputs of \p this are entries for \p that, and + * - all the inputs of \p that are exits for \p this. + */ + bool AreSelfContained(const SubGraph& that) const; + + /*! + * \brief Returns disjoint union of this and \p that sub-graphs. The result may not be valid. + */ + SubGraph DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const; + + /*! + * \brief Returns copy of this sub-graph with all nodes placed inside a sub-sub-graph with + * given attributes. + */ + SubGraph WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const; + + /*! + * \brief Returns copy of this sub-graph with all indexes substituted according to \p subst, + * whose range is w.r.t. \p new_dataflow_graph. + */ + SubGraph Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const; + + /*! + * \brief Returns \p expr rewritten according to all the given sub-graphs. The sub-graphs can + * be given in any order, but must be disjoint. 
+ */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_graphs); + + TVM_DEFINE_OBJECT_REF_METHODS(SubGraph, ObjectRef, SubGraphNode); +}; + +struct SubGraphEqual { + bool operator()(const SubGraph& left, const SubGraph& right) const { + return *left.get() == *right.get(); + } +}; + +struct SubGraphHash { + size_t operator()(const SubGraph& sub_graph) const { return sub_graph->hash(); } +}; + +/*! + * \brief Pass to partition every global function according to the post-dfs indexes + * given in an array. Visible for testing from Python only, would never make sense to use + * as a generic pass! + */ +transform::Pass PartitionOnIndexesForTesting(Array indexes); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_SUB_GRAPH_H_ diff --git a/src/relay/collage/utils.cc b/src/relay/collage/utils.cc new file mode 100644 index 0000000000000..03af980e8c1d3 --- /dev/null +++ b/src/relay/collage/utils.cc @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/utils.cc + * \brief Misc helpers. 
+ */ + +#include "./utils.h" + +#include "../../support/scalars.h" +#include "../op/memory/device_copy.h" + +namespace tvm { +namespace relay { +namespace collage { + +String GetSpecName(const Target& target) { + if (TargetKind::GetAttrMap(tvm::attr::kIsExternalCodegen).get(target->kind, Bool(false))) { + return target->kind->name; + } else { + return std::string(kTVMSpecNamePrefix) + target->kind->name; + } +} + +String UnionLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + return left + "+" + right; +} + +String NestLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + if (right.size() > left.size()) { + std::string right_str = right; + if (right_str.substr(0, left.size()) == left) { + return right; + } + } + return left + "." + right; +} + +std::string KindToString(OpPatternKind kind) { + switch (kind) { + case kElemWise: + return "E"; + case kBroadcast: + return "B"; + case kInjective: + return "I"; + case kCommReduce: + return "R"; + case kOutEWiseFusable: + return "A"; + case kTuple: + return "T"; + case kOpaque: + return "O"; + } + return "?"; +} + +OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right) { + return std::max(left, right); +} + +bool CanInline(const Expr& expr) { + if (expr.as() || expr.as() || expr.as()) { + return true; + } + if (const auto* constant_node = expr.as()) { + return support::IsSimpleScalar(constant_node); + } + return false; +} + +bool IsSpecialOp(const OpNode* op_node) { + auto op = GetRef(op_node); + static auto fnoncomputational = Op::GetAttrMap("TNonComputational"); + if (fnoncomputational.count(op) && fnoncomputational[op]) { + // Operator has been marked as non-computational. + return true; + } + // TODO(mbs): This is incomplete. 
+ static auto shape_of_op_ = Op::Get("shape_of"); + static auto vm_shape_of_op_ = Op::Get("vm.shape_of"); + if (op == DeviceCopyOp() || op == shape_of_op_ || op == vm_shape_of_op_) { + // Operator is compiled away by the VM compilation flow. + return true; + } + return false; +} + +bool MustBeLowered(const Expr& expr) { + if (const auto* call_node = expr.as()) { + if (const auto* function_node = call_node->op.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + // We've already committed to this call being to one or more operators which must be + // lowered. + return true; + } + } else if (const auto* op_node = call_node->op.as()) { + if (!IsSpecialOp(op_node)) { + // The VM compilation path won't rewrite this call. + return true; + } + } + } + return false; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/utils.h b/src/relay/collage/utils.h new file mode 100644 index 0000000000000..4c0493cdd675c --- /dev/null +++ b/src/relay/collage/utils.h @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/utils.h + * \brief Misc helpers. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_UTILS_H_ +#define TVM_RELAY_COLLAGE_UTILS_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Distinguished partition spec names. + */ +constexpr const char* kTVMSpecNamePrefix = "tvm_"; +constexpr const char* kHostSpecName = "host"; + +/*! + * \brief Returns the partition spec name to use for \p target. For external codegen targets the + * spec name is just the target kind name. For TVM native targets the spec name is of the form + * "tvm_". + */ +String GetSpecName(const Target& target); + +/*! \brief Returns \p "+". */ +String UnionLabels(String left, String right); + +/*! \brief Returns \p ".". */ +String NestLabels(String outer, String inner); + +/*! \brief Returns abbreviation for \p kind. */ +std::string KindToString(OpPatternKind kind); + +/*! \brief Returns maximum of \p left and \p right. */ +OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right); + +/*! + * \brief Returns true if \p expr can be safely inlined in body of function extracted + * from sub-graph, even if \p expr was not technically matched by the pattern which produced + * the sub-graph. + */ +bool CanInline(const Expr& expr); + +/*! + * \brief Returns true if \p op_node can be directly handled by the VM. + */ +bool IsSpecialOp(const OpNode* op_node); + +/*! + * \brief Return true if the Relay expression node given by \p expr cannot be evaluated by + * the VM and must end up in a kernel. 
+ */ +bool MustBeLowered(const Expr& expr); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_UTILS_H_ diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index e3e3bfbb973e5..e38066c83d360 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -585,6 +585,7 @@ RELAY_REGISTER_OP("nn.relu") .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) .add_type_rel("Identity", IdentityRel) + // .set_attr("TOpPattern", kElemWise) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 3f1985b7ddfa5..0a5a29a9872c8 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -216,6 +216,8 @@ class AnnotateTargetRewriter : public ExprRewriter { if (!Op::HasAttrMap("target." + std::string(target))) { continue; } + // TODO(mbs): Do not check in + LOG(FATAL) << "Unexpected BYOC predicate on op " << op->name << " for target " << target; auto fannotate = Op::GetAttrMap("target." + std::string(target)); const Expr& ex = GetRef(pre); if (fannotate.count(op) && fannotate[op](ex)) { diff --git a/src/relay/transforms/infer_layout_utils.cc b/src/relay/transforms/infer_layout_utils.cc index efe886c29d23b..70e7d5d1cf2fc 100644 --- a/src/relay/transforms/infer_layout_utils.cc +++ b/src/relay/transforms/infer_layout_utils.cc @@ -42,9 +42,9 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo // 2) Find the Index of this dual axis in old_layout. // 3) Find the shape of the that axis in old_shape. // 4) a) Adjust factor to 1, if that shape is 1. b) Else retain the factor. 
- DLOG(INFO) << "AdjustSubordinateFactors" - << "src_layout: " << src_layout << " old_layout: " << old_layout - << " old_shape: " << old_shape << std::endl; + VLOG(1) << "AdjustSubordinateFactors" + << "src_layout: " << src_layout << " old_layout: " << old_layout + << " old_shape: " << old_shape << std::endl; std::string new_layout; for (auto axis : src_layout->axes) { if (!LayoutAxis::Get(axis).IsPrimal()) { @@ -85,8 +85,8 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo } bool Isomorphic(const Layout& lhs, const Layout& rhs) { - DLOG(INFO) << "Isomorphic: " - << "lhs: " << lhs << " rhs: " << rhs << std::endl; + VLOG(1) << "Isomorphic: " + << "lhs: " << lhs << " rhs: " << rhs << std::endl; ICHECK(lhs.defined()); ICHECK(rhs.defined()); if (lhs->axes.size() != rhs->axes.size()) return false; @@ -115,8 +115,8 @@ bool Isomorphic(const Layout& lhs, const Layout& rhs) { } Layout TryTransformLike(const Layout& old, const Layout& ref_old, const Layout& ref_new) { - DLOG(INFO) << "transform_layout: old = " << old << ", ref_new = " << ref_new - << ", ref_old = " << ref_old << std::endl; + VLOG(1) << "transform_layout: old = " << old << ", ref_new = " << ref_new + << ", ref_old = " << ref_old << std::endl; ICHECK(ref_old.defined()); ICHECK(ref_new.defined()); ICHECK(old.defined()); @@ -181,7 +181,7 @@ Layout TryTransformLike(const Layout& old, const Layout& ref_old, const Layout& } } - DLOG(INFO) << "new_layout = " << new_layout << std::endl; + VLOG(1) << "new_layout = " << new_layout << std::endl; return Layout(new_layout); } diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index 012b3579494f1..90b916b5471ed 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -218,8 +218,13 @@ IRModule Inline(const IRModule& module) { namespace transform { Pass Inline() { - runtime::TypedPackedFunc pass_func = - [=](IRModule m, PassContext pc) { return relay::Inline(m); }; + 
runtime::TypedPackedFunc pass_func = [=](IRModule mod, + PassContext pc) { + VLOG(1) << "Inline input:" << std::endl << PrettyPrint(mod); + IRModule out_mod = relay::Inline(mod); + VLOG(1) << "Inline result:" << std::endl << PrettyPrint(out_mod); + return out_mod; + }; return CreateModulePass(pass_func, 1, "InlineGlobals", {}); } diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 5b584e199dc73..13dc689f6f135 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -79,6 +79,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { for (auto field : tuple_node->fields) { auto new_field = Mutate(field); if (const auto* op = new_field.as()) { + // TODO(mbs): Replace with support::IsSimpleScalar? DataType dtype(op->data->dtype); bool is_simple_const = (dtype == DataType::Int(32) || dtype == DataType::Int(64) || dtype == DataType::Float(32) || dtype == DataType::Float(64) || diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index bc1ed518d4736..bbe03c6421ad3 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -331,6 +331,12 @@ class Partitioner : public MixedModeMutator { global_region_func = WithAttr(std::move(global_region_func), attr::kPrimitive, tvm::Integer(1)); global_region_func = WithAttr(std::move(global_region_func), attr::kCompiler, tvm::runtime::String(target)); + // TODO(mbs): The partitioned functions are tagged as Inline=1 so that they can be collapsed + // back into the main relay function and thus pass through the keyhole of the + // GraphExecutorCodegen and AOTExecutorCodegen's 'codegen' method, only to then be outlined + // again. Ideally codegen would be IRModule at a time instead of function at a time, but + // the assumption of IRModule == single 'main' function is now so deeply engrained I it would + // be tricky to make that change. 
global_region_func = WithAttr(std::move(global_region_func), attr::kInline, tvm::Integer(1)); std::string fname = name; diff --git a/src/relay/transforms/simplify_inference.cc b/src/relay/transforms/simplify_inference.cc index e7eef41e41c4b..04383d33cfeb2 100644 --- a/src/relay/transforms/simplify_inference.cc +++ b/src/relay/transforms/simplify_inference.cc @@ -204,7 +204,7 @@ class InferenceSimplifier : public MixedModeMutator { return new_e; } - Expr Rewrite_(const CallNode* n, const Expr& new_n) { + Expr Rewrite_(const CallNode* n, const Expr& new_n) final { if (n->op == batch_norm_op_) { ty_map_[new_n.as()->args[0]] = n->args[0]->checked_type(); } else if (n->op == layer_norm_op_) { diff --git a/src/runtime/const_loader_module.cc b/src/runtime/const_loader_module.cc index 2e91d26d5f965..520cc8342bad2 100644 --- a/src/runtime/const_loader_module.cc +++ b/src/runtime/const_loader_module.cc @@ -55,11 +55,16 @@ class ConstLoaderModuleNode : public ModuleNode { // symbol lookup for initialization. Otherwise, symbols/primitives in the // DSO module will also be cached but they never need to be initialized. for (const auto& it : const_vars_by_symbol_) { + for (const auto& s : it.second) { + VLOG(1) << "ConstLoaderModuleNode has constant symbol '" << s << "' for function '" + << it.first << "'"; + } initialized_[it.first] = false; } } PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { + VLOG(1) << "ConstLoaderModuleNode::GetFunction(" << name << ")"; // Initialize and memoize the module. // Usually, we have some warmup runs. The module initialization should be // done at this stage. Therefore, runtime overhead is not a concern. 
diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index 3971081bf8f8a..cd46967e532b7 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -67,7 +67,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Transpose(TensorRTOpConverterParams* par // Batch dimension cannot be modified. ICHECK_EQ(input->getDimensions().nbDims, order.size() - 1); ICHECK_EQ(order[0], 0); - for (size_t i = 0; i < order.size(); ++i) { + for (size_t i = 0; i + 1 < order.size(); ++i) { perm.order[i] = order[i + 1] - 1; } } else { @@ -880,7 +880,7 @@ class ConcatOpConverter : public TensorRTOpConverter { const int input_rank = params->inputs[0].tensor->getDimensions().nbDims; std::vector input_tensors; for (auto input : params->inputs) { - ICHECK(input.type == kTensor); + ICHECK_EQ(input.type, kTensor); ICHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); input_tensors.push_back(input.tensor); } diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b4d7b41b7f4ae..fd8e99d2c9997 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -118,20 +118,22 @@ class CUDADeviceAPI final : public DeviceAPI { CUDA_CALL(cudaSetDevice(dev.device_id)); size_t free_mem, total_mem; CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); - VLOG(1) << "allocating " << nbytes << " bytes on device, with " << free_mem - << " bytes currently free out of " << total_mem << " bytes available"; + VLOG(1) << "allocating " << nbytes << " bytes on device " << dev.device_id << " with " + << free_mem << " bytes currently free out of " << total_mem << " bytes available"; CUDA_CALL(cudaMalloc(&ret, nbytes)); } + VLOG(1) << "allocated at " << std::hex << reinterpret_cast(ret); return ret; } void FreeDataSpace(Device dev, void* ptr) final { if (dev.device_type == kDLCUDAHost) { - VLOG(1) << "freeing host memory"; + VLOG(1) << "freeing host memory at 
" << std::hex << reinterpret_cast(ptr); CUDA_CALL(cudaFreeHost(ptr)); } else { CUDA_CALL(cudaSetDevice(dev.device_id)); - VLOG(1) << "freeing device memory"; + VLOG(1) << "freeing device " << dev.device_id << " memory at " << std::hex + << reinterpret_cast(ptr); CUDA_CALL(cudaFree(ptr)); } } diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index e5f236983a735..743d1b900ea97 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -52,6 +52,8 @@ class PooledAllocator final : public Allocator { auto&& pool = it->second; auto ret = pool.back(); pool.pop_back(); + VLOG(1) << "reusing buffer of " << ret.size << " bytes at " << std::hex + << reinterpret_cast(ret.data); return ret; } Buffer buf; @@ -67,7 +69,9 @@ class PooledAllocator final : public Allocator { } used_memory_.fetch_add(size, std::memory_order_relaxed); - VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + VLOG(1) << "allocated " << size << " bytes at " << std::hex + << reinterpret_cast(buf.data) << std::dec << ", total used memory is now " + << used_memory_ << " bytes"; return buf; } @@ -77,7 +81,8 @@ class PooledAllocator final : public Allocator { memory_pool_.emplace(buffer.size, std::vector{}); } memory_pool_.at(buffer.size).push_back(buffer); - VLOG(1) << "reclaim buffer " << buffer.size; + VLOG(1) << "reclaiming buffer of " << buffer.size << " bytes at " << std::hex + << reinterpret_cast(buffer.data); } size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } @@ -88,12 +93,14 @@ class PooledAllocator final : public Allocator { for (auto const& it : memory_pool_) { auto const& pool = it.second; for (auto const& buf : pool) { + VLOG(1) << "freeing " << buf.size << " bytes at " << std::hex + << reinterpret_cast(buf.data); DeviceAPI::Get(buf.device)->FreeDataSpace(buf.device, buf.data); } } memory_pool_.clear(); used_memory_ = 0; - VLOG(1) << "release all buffers"; + VLOG(1) 
<< "released all buffers"; } private: diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 8d03dbf210c33..43c2b3ddb0573 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -264,6 +264,7 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag, } else { LOG(FATAL) << "The type of input tensor tag (" << tag.type_code() << ") doesn't match integer or string"; + inp_index = 0; } ICHECK_LT(inp_index, params_num); @@ -359,11 +360,11 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - DLOG(INFO) << "Executing Function: " << std::endl << func; + VLOG(1) << "Executing Function: " << std::endl << func; for (int i = 0; i < static_cast(devices_.size()); ++i) { - DLOG(INFO) << "Device " << i << " has device type " << devices_[i].device_type - << " and device id " << devices_[i].device_id - << (i == exec_->host_device_index ? " (using as host device)" : ""); + VLOG(1) << "Device " << i << " has device type " << devices_[i].device_type << " and device id " + << devices_[i].device_id + << (i == exec_->host_device_index ? 
" (using as host device)" : ""); } InvokeGlobal(func, args); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 1148013706ab7..a0b3c6fcfc1de 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -416,6 +416,9 @@ TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU) // line break TVM_REGISTER_TARGET_KIND("composite", kDLCPU) // line break .add_attr_option>("devices"); +TVM_REGISTER_TARGET_KIND("test_external_cpu_target", kDLCPU) // line break + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + /********** Registry **********/ TVM_REGISTER_GLOBAL("target.TargetKindGetAttr") diff --git a/tests/cpp/relay/collage/partition_rule_test.cc b/tests/cpp/relay/collage/partition_rule_test.cc new file mode 100644 index 0000000000000..4e55359993adf --- /dev/null +++ b/tests/cpp/relay/collage/partition_rule_test.cc @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "../../../src/relay/collage/partition_rule.h" + +#include +#include +#include +#include + +#include "../../../src/relay/collage/partition_spec.h" + +namespace tvm { +namespace relay { +namespace { + +IRModule TestIRModule() { + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); // 3 + %1 = nn.relu(%0); // 4 + nn.relu(%1) // 5 + } + )"; + return parser::ParseModule("string", kModel); +} + +std::vector MakeCandidates( + const collage::DataflowGraph& graph, const runtime::String rule_name, + const collage::PartitionSpec& spec, const std::vector> index_sets) { + std::vector candidate_partitions; + for (const auto& indexes : index_sets) { + auto subgraph = collage::SubGraph(graph, collage::IndexSet(graph.size(), indexes)); + auto candidate = collage::CandidatePartition(rule_name, subgraph, spec); + candidate_partitions.emplace_back(std::move(candidate)); + } + return candidate_partitions; +} + +TEST(PartitionRule, DFPatternSingleOp) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto expected_candidates = MakeCandidates(graph, "relu_pattern", spec, {{4}, {5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), 2); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, DFPatternOverlap) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto pattern = + 
IsOp("nn.relu")({IsOp("nn.relu")({IsWildcard()}) || IsOp("abs")({IsWildcard()})}); + auto rule = collage::DFPatternPartitionRule("relu+abs_pattern", pattern); + auto expected_candidates = MakeCandidates(graph, "relu+abs_pattern", spec, {{3, 4}, {4, 5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), 2); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, Composite) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto composite_rule = collage::CompositePartitionRule("composite", df_rule); + + auto candidates = composite_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TEST(PartitionRule, PrimitiveTVM) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + constexpr const char* kExpectedMod = 
R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = collage::PrimitivePartitionRule("primitive", df_rule); + + auto candidates = primitive_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TVM_REGISTER_TARGET_KIND("test_ext_codegen", kDLCUDA) + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + +TEST(PartitionRule, PrimitiveExternal) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("test_ext_codegen"); + auto spec = collage::PartitionSpec("test_ext_codegen", target, {}); + + { + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = collage::PrimitivePartitionRule("primitive", 
df_rule); + + auto candidates = primitive_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TEST(PartitionRule, Union) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto abs_pattern = IsOp("abs")({IsWildcard()}); + auto abs_rule = collage::DFPatternPartitionRule("abs_pattern", abs_pattern); + auto relu_pattern = IsOp("nn.relu")({IsWildcard()}); + auto relu_rule = collage::DFPatternPartitionRule("relu_pattern", relu_pattern); + auto union_rule = collage::UnionPartitionRule("union", {abs_rule, relu_rule}); + + auto abs_candidates = MakeCandidates(graph, "abs_pattern", spec, {{3}}); + auto relu_candidates = MakeCandidates(graph, "relu_pattern", spec, {{4}, {5}}); + + std::vector expected_candidates; + expected_candidates.insert(expected_candidates.end(), abs_candidates.begin(), + abs_candidates.end()); + expected_candidates.insert(expected_candidates.end(), relu_candidates.begin(), + relu_candidates.end()); + + auto candidates = union_rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), expected_candidates.size()); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, OpCallByKind) { + constexpr const char* kMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); // 4 + %1 = add(%0, %x); // 5 + shape_of(%1) // 6 + } + )"; + auto main = Downcast(parser::ParseModule("string", kMod)->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto rule 
= collage::OpCallByKindPartitionRule("op_call_by_kind"); + auto expected_candidates = MakeCandidates(graph, "op_call_by_kind", spec, {{4}, {5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), expected_candidates.size()); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +} // namespace +} // namespace relay +} // namespace tvm diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes index 1cdb78e31913c..cbc2043cfa4ed 100644 --- a/tests/lint/rat-excludes +++ b/tests/lint/rat-excludes @@ -19,6 +19,7 @@ .*\.log .*\.interp .*\.tokens +.*\.tuninglog # microTVM test data files testdata diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index cecb64785a49a..38da305f3b172 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -18,7 +18,7 @@ import numpy as np import pytest import itertools - +import logging import tvm import tvm.relay.testing @@ -33,12 +33,14 @@ from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt - SUPPORTED_DTYPES = ["float16", "float32"] has_tensorrt_codegen = pytest.mark.skipif( - not tvm.get_global_func("relay.ext.tensorrt", True), reason="TensorRT codegen not available" + not tensorrt.is_tensorrt_compiler_enabled(), reason="TensorRT codegen not available" ) + +# CAUTION: Currently always false in CI since adds tens of minutes to test time and depends +# on TensorRT installation. 
See https://github.com/apache/tvm/issues/11765 has_tensorrt_runtime = pytest.mark.skipif( not tensorrt.is_tensorrt_runtime_enabled(), reason="TensorRT runtime not available" ) @@ -72,7 +74,7 @@ def assert_result_dict_holds(result_dict, dtype="float16"): tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=5e-3) -def set_func_attr(func, compile_name, symbol_name): +def set_outer_func_attr(func, compile_name, symbol_name): func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Compiler", compile_name) @@ -80,6 +82,12 @@ def set_func_attr(func, compile_name, symbol_name): return func +def set_inner_func_attr(func, pattern_name, composite_name): + func = func.with_attr("PartitionedFromPattern", pattern_name) + func = func.with_attr("Composite", composite_name) + return func + + def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. 
@@ -110,34 +118,31 @@ def run_and_verify_func(config, target="cuda", run_module=True, data_type="float result_dict = dict() for mode in ["vm", "graph"]: - for mode in ["graph"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod = relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_fp16=data_type == "float16" - ) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - else: - mod = relay.transform.InferType()(mod) - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + use_fp16 = data_type == "float16" + trt_target = tvm.target.Target(f"tensorrt -use_fp16={use_fp16}") + mod = relay.transform.InferType()(mod) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=[target, trt_target] + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() - if run_module: - result_dict[result_key] = func(**input_dict, **params) + if run_module: + result_dict[result_key] = func(**input_dict, **params) - if run_module: - assert_result_dict_holds(result_dict, data_type) + if run_module: + assert_result_dict_holds(result_dict, data_type) def test_tensorrt_simple(run_module): @@ -163,10 +168,8 @@ def test_tensorrt_simple(run_module): result_key = mode + ("_trt" if use_trt else "") if use_trt: mod = 
relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + mod = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -212,9 +215,9 @@ def test_tensorrt_not_compatible(run_module): f = relay.Function([x], out) mod = tvm.IRModule() mod["main"] = f - mod, config = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) for mode in ["graph", "vm"]: - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -622,26 +625,18 @@ def are_ops_on_graph(self, subgraph) -> bool: def are_ops_on_trt(mod, op_list): + op_on_trt = False + op_on_tvm = False for subgraph in mod.get_global_vars(): name = subgraph.name_hint - op_on_trt = False - op_on_tvm = True - if name == "main": - op_on_tvm = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - elif mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": - op_on_trt = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + if mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": + op_on_trt |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) else: - op_on_tvm &= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - - if not op_on_trt or op_on_tvm: - return False + op_on_tvm |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - return True + return op_on_trt and not op_on_tvm -@pytest.mark.xfail( - reason=("Currently failing test. 
See tracking issue https://github.com/apache/tvm/issues/8901") -) def test_dynamic_reshape(run_module): def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): result_arr = [{} for _ in range(len(x_data_list))] @@ -652,9 +647,9 @@ def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt( - mod, params={}, remove_no_mac_subgraphs=False - ) + logging.info("Before partitioning:\n%s", mod) + mod = tensorrt.partition_for_tensorrt(mod) + logging.info("After partitioning:\n%s", mod) assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt if run_module: with relay.build_config(opt_level=3): @@ -1051,6 +1046,7 @@ def get_graph(d_type="float16"): run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type) +@pytest.mark.skip(reason=("Fails assert_allclose. See https://github.com/apache/tvm/issues/11765")) def test_conv3d(run_module): def get_graph( x_shape=(1, 24, 8, 8, 8), @@ -1143,11 +1139,6 @@ def get_graph( ) -@pytest.mark.xfail( - reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") -) -@has_tensorrt_codegen -@tvm.testing.requires_cuda def test_dynamic_offload(): """ This test checks for proper dynamic offloading of relay graphs. 
An addition between @@ -1161,24 +1152,29 @@ def test_dynamic_offload(): x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32") y = relay.var("y", shape=(data_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + kernel = relay.const(np.random.rand(*k_shape).astype("float32")) def get_expected(): # Create a nested TRT function that matches the expected output mod = tvm.IRModule() - var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") - kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32") - out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4]) - f1 = GlobalVar("tvmgen_default_tensorrt_0") - func = relay.Function([var1, kernel_trt], out1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") - mod[f1] = func + outer_var = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") + inner_var = relay.var("FunctionVar_0_0", shape=(data_shape), dtype="float32") + inner_body = relay.nn.conv2d( + inner_var, kernel, channels=k_shape[0], kernel_size=k_shape[2:4] + ) + inner_func = relay.Function([inner_var], inner_body) + inner_func = set_inner_func_attr(inner_func, "nn.conv2d_", "tensorrt.nn.conv2d") + outer_body = inner_func(outer_var) + outer_func = relay.Function([outer_var], outer_body) + outer_func = set_outer_func_attr(outer_func, "tensorrt", "tvmgen_default_tensorrt_main_0") + gv = GlobalVar("tvmgen_default_tensorrt_main_0") + mod[gv] = outer_func mod = relay.transform.InferType()(mod) # Create the main function out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) - out = relay.add(out1, f1(y, kernel)) - f = relay.Function([x, y, kernel], out) + out = relay.add(out1, gv(y)) + f = relay.Function([x, y], out) mod["main"] = f mod = relay.transform.InferType()(mod) return mod @@ -1187,13 +1183,13 @@ def get_expected(): out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], 
kernel_size=k_shape[2:4]) out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) out = relay.add(out1, out2) - f = relay.Function([x, y, kernel], out) + f = relay.Function([x, y], out) # Pass the function to TRT compilation mod = tvm.IRModule() mod["main"] = f mod = relay.transform.InferType()(mod) - mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={}) + mod_trt = tensorrt.partition_for_tensorrt(mod) # Get the expected relay graph and compare mod_exp = get_expected() @@ -1212,7 +1208,7 @@ def test_tensorrt_dynamic_batch(run_module): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) if run_module: with relay.build_config(opt_level=3): @@ -1242,17 +1238,17 @@ def test_tensorrt_dynamic_batch_conv(run_module): f = relay.Function([x, kernel], out) mod = tvm.IRModule() mod["main"] = f + trt_target = tvm.target.Target(f"tensorrt -use_implicit_batch={use_implicit_batch}") if use_trt: - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_implicit_batch=use_implicit_batch - ) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) if run_module: for target in ["llvm", "cuda"]: - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + targets = [target] + if use_trt: + targets.append(trt_target) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( - "vm", mod=mod, device=tvm.device(target), target=target + "vm", mod=mod, device=tvm.device(target), target=targets ).evaluate() for i, batch_size in enumerate(batches_to_test): result_arr[i][target][use_trt] = func(x_data[:batch_size, ...], **params) @@ -1262,6 +1258,11 @@ def test_tensorrt_dynamic_batch_conv(run_module): assert_result_dict_holds(result_arr[i][target]) +@pytest.mark.skip( + reason=( + "Coredumps, possibly due to LLVM and PyTorch version mismatch. 
See https://github.com/apache/tvm/issues/11765" + ) +) def test_maskrcnn_resnet50(run_module) -> None: """ This function tests the working of pytorch maskrcnn with resnet50 as backbone with @@ -1281,9 +1282,11 @@ def convert_traced_model_to_vm_trt( input_name = "input0" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(traced_module, shape_list) - mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True) + trt_target = tvm.target.Target("tensorrt -remove_no_mac_subgraphs=True") + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + targets = [target, trt_target] with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]): - vm_trt_exec = relay.vm.compile(mod, target=target, params=params) + vm_trt_exec = relay.vm.compile(mod, target=targets, params=params) return vm_trt_exec @@ -1381,7 +1384,7 @@ def test_empty_subgraph(run_module): var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32") f1 = GlobalVar("tensorrt_0") func = relay.Function([var1], var1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") + func = set_outer_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") mod[f1] = func mod = relay.transform.InferType()(mod) @@ -1402,4 +1405,5 @@ def test_empty_subgraph(run_module): if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) tvm.testing.main() diff --git a/tests/python/contrib/test_tensorrt_int8_exp.py b/tests/python/contrib/test_tensorrt_int8_exp.py index 84360e92d33b9..97eff3af0e42a 100644 --- a/tests/python/contrib/test_tensorrt_int8_exp.py +++ b/tests/python/contrib/test_tensorrt_int8_exp.py @@ -103,8 +103,8 @@ def test_trt_int8(): # compile the model target = "cuda" dev = tvm.cuda(1) - mod, config = partition_for_tensorrt(mod, params) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + mod = partition_for_tensorrt(mod, params) + with 
tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) dtype = "float32" diff --git a/tests/python/relay/collage/menangerie.py b/tests/python/relay/collage/menangerie.py new file mode 100644 index 0000000000000..60f150e11cea8 --- /dev/null +++ b/tests/python/relay/collage/menangerie.py @@ -0,0 +1,4286 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import onnx +import numpy as np +import logging +import tvm.contrib.target.onnx + +MODEL_PREFIX = "/home/mbs/gauntlet/models/" +MNIST = { + "name": "mnist", + "filename": "mnist-8.onnx", + "input_shapes": {"Input3": [1, 1, 28, 28]}, + "input_dtypes": {"Input3": "float32"}, + "main_dtype": "float32", +} +GPT2 = { + "name": "gpt2", + "filename": "gpt2.onnx", + "input_shapes": {"input1": [1, 50, 32]}, + "input_dtypes": {"input1": "int64"}, + "main_dtype": "float32", +} +RESNET50V2 = { + "name": "resnet50", + "filename": "resnet50-v2-7.onnx", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "main_dtype": "float32", +} +MOBILENETV2 = { + "name": "mobilenet", + "filename": "mobilenetv2-1.0.onnx", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "main_dtype": "float32", +} +# Note that resnext50_32_4d below was extracted directly from the pytorch model and not from any onnx file. +RESNEXT50_32_4d = { + "name": "resnext50_32_4d", + "filename": "resnext50_32x4d.onnx", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float32"}, + "main_dtype": "float32", +} + + +def make_const(dtype, shape): + return tvm.relay.const(np.random.rand(*shape).astype(dtype)) + + +def make_consts(dtype, shapes): + return [make_const(dtype, shape) for shape in shapes] + + +def mnist_consts(dtype): + return make_consts( + dtype, + [ + (8, 1, 5, 5), # 0 + (8, 1, 1), # 1 + (16, 8, 5, 5), # 2 + (16, 1, 1), # 3 + (10, 256), # 4 + (1, 10), # 5 + ], + ) + + +def mnist(): + metatable = {"relay.Constant": mnist_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 1, 28, 28), float32]) -> Tensor[(1, 10), float32] { + %0 = nn.pad(%x, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]); + %1 = nn.conv2d(%0, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=8, kernel_size=[5, 5]); + %2 = add(%1, meta[relay.Constant][1]); + %3 = nn.relu(%2); + %4 = 
nn.max_pool2d(%3, pool_size=[2, 2], strides=[2, 2], padding=[0, 0, 0, 0]); + %5 = nn.pad(%4, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]); + %6 = nn.conv2d(%5, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=16, kernel_size=[5, 5]); + %7 = add(%6, meta[relay.Constant][3]); + %8 = nn.relu(%7); + %9 = nn.max_pool2d(%8, pool_size=[3, 3], strides=[3, 3], padding=[0, 0, 0, 0]); + %10 = reshape(%9, newshape=[1, 256]); + %11 = nn.dense(%10, meta[relay.Constant][4], units=None, out_dtype="float32"); + add(%11, meta[relay.Constant][5]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mnist", + "input_shapes": {"x": [1, 1, 28, 28]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_consts(dtype): + return make_consts( + dtype, + [ + (50257, 768), # 0 + (1, 32, 768), # 1 + (768,), # 2 + (768,), # 3 + (2304, 768), # 4 + (2304,), # 5 + (1, 1, 32, 32), # 6 + (1, 1, 32, 32), # 7 + (768, 768), # 8 + (768,), # 9 + (768,), # 10 + (768,), # 11 + (3072, 768), # 12 + (3072,), # 13 + (768, 3072), # 14 + (768,), # 15 + (768,), # 16 + (768,), # 17 + (2304, 768), # 18 + (2304,), # 19 + (1, 1, 32, 32), # 20 + (1, 1, 32, 32), # 21 + (768, 768), # 22 + (768,), # 23 + (768,), # 24 + (768,), # 25 + (3072, 768), # 26 + (3072,), # 27 + (768, 3072), # 28 + (768,), # 29 + (768,), # 30 + (768,), # 31 + (2304, 768), # 32 + (2304,), # 33 + (1, 1, 32, 32), # 34 + (1, 1, 32, 32), # 35 + (768, 768), # 36 + (768,), # 37 + (768,), # 38 + (768,), # 39 + (3072, 768), # 40 + (3072,), # 41 + (768, 3072), # 42 + (768,), # 43 + (768,), # 44 + (768,), # 45 + (2304, 768), # 46 + (2304,), # 47 + (1, 1, 32, 32), # 48 + (1, 1, 32, 32), # 49 + (768, 768), # 50 + (768,), # 51 + (768,), # 52 + (768,), # 53 + (3072, 768), # 54 + (3072,), # 55 + (768, 3072), # 56 + (768,), # 57 + (768,), # 58 + (768,), # 59 + (2304, 768), # 60 + (2304,), # 61 + (1, 1, 32, 32), # 62 + (1, 1, 32, 32), # 63 + (768, 768), # 64 + (768,), # 65 + 
(768,), # 66 + (768,), # 67 + (3072, 768), # 68 + (3072,), # 69 + (768, 3072), # 70 + (768,), # 71 + (768,), # 72 + (768,), # 73 + (2304, 768), # 74 + (2304,), # 75 + (1, 1, 32, 32), # 76 + (1, 1, 32, 32), # 77 + (768, 768), # 78 + (768,), # 79 + (768,), # 80 + (768,), # 81 + (3072, 768), # 82 + (3072,), # 83 + (768, 3072), # 84 + (768,), # 85 + (768,), # 86 + (768,), # 87 + (2304, 768), # 88 + (2304,), # 89 + (1, 1, 32, 32), # 90 + (1, 1, 32, 32), # 91 + (768, 768), # 92 + (768,), # 93 + (768,), # 94 + (768,), # 95 + (3072, 768), # 96 + (3072,), # 97 + (768, 3072), # 98 + (768,), # 99 + (768,), # 100 + (768,), # 101 + (2304, 768), # 102 + (2304,), # 103 + (1, 1, 32, 32), # 104 + (1, 1, 32, 32), # 105 + (768, 768), # 106 + (768,), # 107 + (768,), # 108 + (768,), # 109 + (3072, 768), # 110 + (3072,), # 111 + (768, 3072), # 112 + (768,), # 113 + (768,), # 114 + (768,), # 115 + (2304, 768), # 116 + (2304,), # 117 + (1, 1, 32, 32), # 118 + (1, 1, 32, 32), # 119 + (768, 768), # 120 + (768,), # 121 + (768,), # 122 + (768,), # 123 + (3072, 768), # 124 + (3072,), # 125 + (768, 3072), # 126 + (768,), # 127 + (768,), # 128 + (768,), # 129 + (2304, 768), # 130 + (2304,), # 131 + (1, 1, 32, 32), # 132 + (1, 1, 32, 32), # 133 + (768, 768), # 134 + (768,), # 135 + (768,), # 136 + (768,), # 137 + (3072, 768), # 138 + (3072,), # 139 + (768, 3072), # 140 + (768,), # 141 + (768,), # 142 + (768,), # 143 + (2304, 768), # 144 + (2304,), # 145 + (1, 1, 32, 32), # 146 + (1, 1, 32, 32), # 147 + (768, 768), # 148 + (768,), # 149 + (768,), # 150 + (768,), # 151 + (3072, 768), # 152 + (3072,), # 153 + (768, 3072), # 154 + (768,), # 155 + (768,), # 156 + (768,), # 157 + (2304, 768), # 158 + (2304,), # 159 + (1, 1, 32, 32), # 160 + (1, 1, 32, 32), # 161 + (768, 768), # 162 + (768,), # 163 + (768,), # 164 + (768,), # 165 + (3072, 768), # 166 + (3072,), # 167 + (768, 3072), # 168 + (768,), # 169 + (768,), # 170 + (768,), # 171 + ], + ) + + +def gpt2(): + metatable = {"relay.Constant": 
gpt2_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32]) { + %0 = reshape(%x, newshape=[-1, 32]); + %1 = less(%0, 0i64); + %2 = add(%0, 50257i64); + %3 = where(%1, %2, %0); + %4 = take(meta[relay.Constant][0], %3, axis=0); + %5 = add(%4, meta[relay.Constant][1]); + %6 = mean(%5, axis=[-1], keepdims=True); + %7 = subtract(%5, %6); + %8 = power(%7, 2f); + %9 = mean(%8, axis=[-1], keepdims=True); + %10 = add(%9, 1e-05f); + %11 = sqrt(%10); + %12 = divide(%7, %11); + %13 = multiply(%12, meta[relay.Constant][2]); + %14 = add(%13, meta[relay.Constant][3]); + %15 = reshape(%14, newshape=[-1, 768]); + %16 = nn.dense(%15, meta[relay.Constant][4], units=2304); + %17 = add(%16, meta[relay.Constant][5]); + %18 = reshape(%17, newshape=[50, 32, 2304]); + %19 = split(%18, indices_or_sections=[768, 1536], axis=2); + %20 = %19.0; + %21 = reshape(%20, newshape=[50, 32, 12, 64]); + %22 = transpose(%21, axes=[0, 2, 1, 3]); + %23 = %19.1; + %24 = reshape(%23, newshape=[50, 32, 12, 64]); + %25 = transpose(%24, axes=[0, 2, 3, 1]); + %26 = reshape(%25, newshape=[-1, 64, 32]); + %27 = reshape(%22, newshape=[-1, 32, 64]); + %28 = transpose(%26, axes=[0, 2, 1]); + %29 = nn.batch_matmul(%27, %28, out_dtype="float32", transpose_b=True); + %30 = reshape(%29, newshape=[50, 12, 32, 32]); + %31 = divide(%30, 8f); + %32 = multiply(%31, meta[relay.Constant][6]); + %33 = subtract(%32, meta[relay.Constant][7]); + %34 = 
nn.softmax(%33, axis=3); + %35 = %19.2; + %36 = reshape(%35, newshape=[50, 32, 12, 64]); + %37 = transpose(%36, axes=[0, 2, 1, 3]); + %38 = reshape(%37, newshape=[-1, 32, 64]); + %39 = reshape(%34, newshape=[-1, 32, 32]); + %40 = transpose(%38, axes=[0, 2, 1]); + %41 = nn.batch_matmul(%39, %40, out_dtype="float32", transpose_b=True); + %42 = reshape(%41, newshape=[50, 12, 32, 64]); + %43 = transpose(%42, axes=[0, 2, 1, 3]); + %44 = reshape(%43, newshape=[50, 32, 768]); + %45 = reshape(%44, newshape=[-1, 768]); + %46 = nn.dense(%45, meta[relay.Constant][8], units=768); + %47 = add(%46, meta[relay.Constant][9]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(%5, %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][10]); + %58 = add(%57, meta[relay.Constant][11]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][12], units=3072); + %61 = add(%60, meta[relay.Constant][13]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f); + %64 = multiply(%63, 0.044715f); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f); + %69 = add(%67, 1f); + %70 = multiply(%68, %69); + %71 = reshape(%70, newshape=[-1, 3072]); + %72 = nn.dense(%71, meta[relay.Constant][14], units=768); + %73 = add(%72, meta[relay.Constant][15]); + %74 = reshape(%73, newshape=[50, 32, 768]); + %75 = add(%49, %74); + %76 = mean(%75, axis=[-1], keepdims=True); + %77 = subtract(%75, %76); + %78 = power(%77, 2f); + %79 = mean(%78, axis=[-1], keepdims=True); + %80 = add(%79, 1e-05f); + %81 = sqrt(%80); + %82 = divide(%77, %81); + %83 = multiply(%82, meta[relay.Constant][16]); + %84 = add(%83, meta[relay.Constant][17]); + %85 = reshape(%84, newshape=[-1, 768]); + %86 = nn.dense(%85, 
meta[relay.Constant][18], units=2304); + %87 = add(%86, meta[relay.Constant][19]); + %88 = reshape(%87, newshape=[50, 32, 2304]); + %89 = split(%88, indices_or_sections=[768, 1536], axis=2); + %90 = %89.0; + %91 = reshape(%90, newshape=[50, 32, 12, 64]); + %92 = transpose(%91, axes=[0, 2, 1, 3]); + %93 = %89.1; + %94 = reshape(%93, newshape=[50, 32, 12, 64]); + %95 = transpose(%94, axes=[0, 2, 3, 1]); + %96 = reshape(%95, newshape=[-1, 64, 32]); + %97 = reshape(%92, newshape=[-1, 32, 64]); + %98 = transpose(%96, axes=[0, 2, 1]); + %99 = nn.batch_matmul(%97, %98, out_dtype="float32", transpose_b=True); + %100 = reshape(%99, newshape=[50, 12, 32, 32]); + %101 = divide(%100, 8f); + %102 = multiply(%101, meta[relay.Constant][20]); + %103 = subtract(%102, meta[relay.Constant][21]); + %104 = nn.softmax(%103, axis=3); + %105 = %89.2; + %106 = reshape(%105, newshape=[50, 32, 12, 64]); + %107 = transpose(%106, axes=[0, 2, 1, 3]); + %108 = reshape(%107, newshape=[-1, 32, 64]); + %109 = reshape(%104, newshape=[-1, 32, 32]); + %110 = transpose(%108, axes=[0, 2, 1]); + %111 = nn.batch_matmul(%109, %110, out_dtype="float32", transpose_b=True); + %112 = reshape(%111, newshape=[50, 12, 32, 64]); + %113 = transpose(%112, axes=[0, 2, 1, 3]); + %114 = reshape(%113, newshape=[50, 32, 768]); + %115 = reshape(%114, newshape=[-1, 768]); + %116 = nn.dense(%115, meta[relay.Constant][22], units=768); + %117 = add(%116, meta[relay.Constant][23]); + %118 = reshape(%117, newshape=[50, 32, 768]); + %119 = add(%75, %118); + %120 = mean(%119, axis=[-1], keepdims=True); + %121 = subtract(%119, %120); + %122 = power(%121, 2f); + %123 = mean(%122, axis=[-1], keepdims=True); + %124 = add(%123, 1e-05f); + %125 = sqrt(%124); + %126 = divide(%121, %125); + %127 = multiply(%126, meta[relay.Constant][24]); + %128 = add(%127, meta[relay.Constant][25]); + %129 = reshape(%128, newshape=[-1, 768]); + %130 = nn.dense(%129, meta[relay.Constant][26], units=3072); + %131 = add(%130, meta[relay.Constant][27]); + 
%132 = reshape(%131, newshape=[50, 32, 3072]); + %133 = power(%132, 3f); + %134 = multiply(%133, 0.044715f); + %135 = add(%132, %134); + %136 = multiply(%135, 0.797885f); + %137 = tanh(%136); + %138 = multiply(%132, 0.5f); + %139 = add(%137, 1f); + %140 = multiply(%138, %139); + %141 = reshape(%140, newshape=[-1, 3072]); + %142 = nn.dense(%141, meta[relay.Constant][28], units=768); + %143 = add(%142, meta[relay.Constant][29]); + %144 = reshape(%143, newshape=[50, 32, 768]); + %145 = add(%119, %144); + %146 = mean(%145, axis=[-1], keepdims=True); + %147 = subtract(%145, %146); + %148 = power(%147, 2f); + %149 = mean(%148, axis=[-1], keepdims=True); + %150 = add(%149, 1e-05f); + %151 = sqrt(%150); + %152 = divide(%147, %151); + %153 = multiply(%152, meta[relay.Constant][30]); + %154 = add(%153, meta[relay.Constant][31]); + %155 = reshape(%154, newshape=[-1, 768]); + %156 = nn.dense(%155, meta[relay.Constant][32], units=2304); + %157 = add(%156, meta[relay.Constant][33]); + %158 = reshape(%157, newshape=[50, 32, 2304]); + %159 = split(%158, indices_or_sections=[768, 1536], axis=2); + %160 = %159.0; + %161 = reshape(%160, newshape=[50, 32, 12, 64]); + %162 = transpose(%161, axes=[0, 2, 1, 3]); + %163 = %159.1; + %164 = reshape(%163, newshape=[50, 32, 12, 64]); + %165 = transpose(%164, axes=[0, 2, 3, 1]); + %166 = reshape(%165, newshape=[-1, 64, 32]); + %167 = reshape(%162, newshape=[-1, 32, 64]); + %168 = transpose(%166, axes=[0, 2, 1]); + %169 = nn.batch_matmul(%167, %168, out_dtype="float32", transpose_b=True); + %170 = reshape(%169, newshape=[50, 12, 32, 32]); + %171 = divide(%170, 8f); + %172 = multiply(%171, meta[relay.Constant][34]); + %173 = subtract(%172, meta[relay.Constant][35]); + %174 = nn.softmax(%173, axis=3); + %175 = %159.2; + %176 = reshape(%175, newshape=[50, 32, 12, 64]); + %177 = transpose(%176, axes=[0, 2, 1, 3]); + %178 = reshape(%177, newshape=[-1, 32, 64]); + %179 = reshape(%174, newshape=[-1, 32, 32]); + %180 = transpose(%178, axes=[0, 2, 1]); 
+ %181 = nn.batch_matmul(%179, %180, out_dtype="float32", transpose_b=True); + %182 = reshape(%181, newshape=[50, 12, 32, 64]); + %183 = transpose(%182, axes=[0, 2, 1, 3]); + %184 = reshape(%183, newshape=[50, 32, 768]); + %185 = reshape(%184, newshape=[-1, 768]); + %186 = nn.dense(%185, meta[relay.Constant][36], units=768); + %187 = add(%186, meta[relay.Constant][37]); + %188 = reshape(%187, newshape=[50, 32, 768]); + %189 = add(%145, %188); + %190 = mean(%189, axis=[-1], keepdims=True); + %191 = subtract(%189, %190); + %192 = power(%191, 2f); + %193 = mean(%192, axis=[-1], keepdims=True); + %194 = add(%193, 1e-05f); + %195 = sqrt(%194); + %196 = divide(%191, %195); + %197 = multiply(%196, meta[relay.Constant][38]); + %198 = add(%197, meta[relay.Constant][39]); + %199 = reshape(%198, newshape=[-1, 768]); + %200 = nn.dense(%199, meta[relay.Constant][40], units=3072); + %201 = add(%200, meta[relay.Constant][41]); + %202 = reshape(%201, newshape=[50, 32, 3072]); + %203 = power(%202, 3f); + %204 = multiply(%203, 0.044715f); + %205 = add(%202, %204); + %206 = multiply(%205, 0.797885f); + %207 = tanh(%206); + %208 = multiply(%202, 0.5f); + %209 = add(%207, 1f); + %210 = multiply(%208, %209); + %211 = reshape(%210, newshape=[-1, 3072]); + %212 = nn.dense(%211, meta[relay.Constant][42], units=768); + %213 = add(%212, meta[relay.Constant][43]); + %214 = reshape(%213, newshape=[50, 32, 768]); + %215 = add(%189, %214); + %216 = mean(%215, axis=[-1], keepdims=True); + %217 = subtract(%215, %216); + %218 = power(%217, 2f); + %219 = mean(%218, axis=[-1], keepdims=True); + %220 = add(%219, 1e-05f); + %221 = sqrt(%220); + %222 = divide(%217, %221); + %223 = multiply(%222, meta[relay.Constant][44]); + %224 = add(%223, meta[relay.Constant][45]); + %225 = reshape(%224, newshape=[-1, 768]); + %226 = nn.dense(%225, meta[relay.Constant][46], units=2304); + %227 = add(%226, meta[relay.Constant][47]); + %228 = reshape(%227, newshape=[50, 32, 2304]); + %229 = split(%228, 
indices_or_sections=[768, 1536], axis=2); + %230 = %229.0; + %231 = reshape(%230, newshape=[50, 32, 12, 64]); + %232 = transpose(%231, axes=[0, 2, 1, 3]); + %233 = %229.1; + %234 = reshape(%233, newshape=[50, 32, 12, 64]); + %235 = transpose(%234, axes=[0, 2, 3, 1]); + %236 = reshape(%235, newshape=[-1, 64, 32]); + %237 = reshape(%232, newshape=[-1, 32, 64]); + %238 = transpose(%236, axes=[0, 2, 1]); + %239 = nn.batch_matmul(%237, %238, out_dtype="float32", transpose_b=True); + %240 = reshape(%239, newshape=[50, 12, 32, 32]); + %241 = divide(%240, 8f); + %242 = multiply(%241, meta[relay.Constant][48]); + %243 = subtract(%242, meta[relay.Constant][49]); + %244 = nn.softmax(%243, axis=3); + %245 = %229.2; + %246 = reshape(%245, newshape=[50, 32, 12, 64]); + %247 = transpose(%246, axes=[0, 2, 1, 3]); + %248 = reshape(%247, newshape=[-1, 32, 64]); + %249 = reshape(%244, newshape=[-1, 32, 32]); + %250 = transpose(%248, axes=[0, 2, 1]); + %251 = nn.batch_matmul(%249, %250, out_dtype="float32", transpose_b=True); + %252 = reshape(%251, newshape=[50, 12, 32, 64]); + %253 = transpose(%252, axes=[0, 2, 1, 3]); + %254 = reshape(%253, newshape=[50, 32, 768]); + %255 = reshape(%254, newshape=[-1, 768]); + %256 = nn.dense(%255, meta[relay.Constant][50], units=768); + %257 = add(%256, meta[relay.Constant][51]); + %258 = reshape(%257, newshape=[50, 32, 768]); + %259 = add(%215, %258); + %260 = mean(%259, axis=[-1], keepdims=True); + %261 = subtract(%259, %260); + %262 = power(%261, 2f); + %263 = mean(%262, axis=[-1], keepdims=True); + %264 = add(%263, 1e-05f); + %265 = sqrt(%264); + %266 = divide(%261, %265); + %267 = multiply(%266, meta[relay.Constant][52]); + %268 = add(%267, meta[relay.Constant][53]); + %269 = reshape(%268, newshape=[-1, 768]); + %270 = nn.dense(%269, meta[relay.Constant][54], units=3072); + %271 = add(%270, meta[relay.Constant][55]); + %272 = reshape(%271, newshape=[50, 32, 3072]); + %273 = power(%272, 3f); + %274 = multiply(%273, 0.044715f); + %275 = 
add(%272, %274); + %276 = multiply(%275, 0.797885f); + %277 = tanh(%276); + %278 = multiply(%272, 0.5f); + %279 = add(%277, 1f); + %280 = multiply(%278, %279); + %281 = reshape(%280, newshape=[-1, 3072]); + %282 = nn.dense(%281, meta[relay.Constant][56], units=768); + %283 = add(%282, meta[relay.Constant][57]); + %284 = reshape(%283, newshape=[50, 32, 768]); + %285 = add(%259, %284); + %286 = mean(%285, axis=[-1], keepdims=True); + %287 = subtract(%285, %286); + %288 = power(%287, 2f); + %289 = mean(%288, axis=[-1], keepdims=True); + %290 = add(%289, 1e-05f); + %291 = sqrt(%290); + %292 = divide(%287, %291); + %293 = multiply(%292, meta[relay.Constant][58]); + %294 = add(%293, meta[relay.Constant][59]); + %295 = reshape(%294, newshape=[-1, 768]); + %296 = nn.dense(%295, meta[relay.Constant][60], units=2304); + %297 = add(%296, meta[relay.Constant][61]); + %298 = reshape(%297, newshape=[50, 32, 2304]); + %299 = split(%298, indices_or_sections=[768, 1536], axis=2); + %300 = %299.0; + %301 = reshape(%300, newshape=[50, 32, 12, 64]); + %302 = transpose(%301, axes=[0, 2, 1, 3]); + %303 = %299.1; + %304 = reshape(%303, newshape=[50, 32, 12, 64]); + %305 = transpose(%304, axes=[0, 2, 3, 1]); + %306 = reshape(%305, newshape=[-1, 64, 32]); + %307 = reshape(%302, newshape=[-1, 32, 64]); + %308 = transpose(%306, axes=[0, 2, 1]); + %309 = nn.batch_matmul(%307, %308, out_dtype="float32", transpose_b=True); + %310 = reshape(%309, newshape=[50, 12, 32, 32]); + %311 = divide(%310, 8f); + %312 = multiply(%311, meta[relay.Constant][62]); + %313 = subtract(%312, meta[relay.Constant][63]); + %314 = nn.softmax(%313, axis=3); + %315 = %299.2; + %316 = reshape(%315, newshape=[50, 32, 12, 64]); + %317 = transpose(%316, axes=[0, 2, 1, 3]); + %318 = reshape(%317, newshape=[-1, 32, 64]); + %319 = reshape(%314, newshape=[-1, 32, 32]); + %320 = transpose(%318, axes=[0, 2, 1]); + %321 = nn.batch_matmul(%319, %320, out_dtype="float32", transpose_b=True); + %322 = reshape(%321, newshape=[50, 12, 
32, 64]); + %323 = transpose(%322, axes=[0, 2, 1, 3]); + %324 = reshape(%323, newshape=[50, 32, 768]); + %325 = reshape(%324, newshape=[-1, 768]); + %326 = nn.dense(%325, meta[relay.Constant][64], units=768); + %327 = add(%326, meta[relay.Constant][65]); + %328 = reshape(%327, newshape=[50, 32, 768]); + %329 = add(%285, %328); + %330 = mean(%329, axis=[-1], keepdims=True); + %331 = subtract(%329, %330); + %332 = power(%331, 2f); + %333 = mean(%332, axis=[-1], keepdims=True); + %334 = add(%333, 1e-05f); + %335 = sqrt(%334); + %336 = divide(%331, %335); + %337 = multiply(%336, meta[relay.Constant][66]); + %338 = add(%337, meta[relay.Constant][67]); + %339 = reshape(%338, newshape=[-1, 768]); + %340 = nn.dense(%339, meta[relay.Constant][68], units=3072); + %341 = add(%340, meta[relay.Constant][69]); + %342 = reshape(%341, newshape=[50, 32, 3072]); + %343 = power(%342, 3f); + %344 = multiply(%343, 0.044715f); + %345 = add(%342, %344); + %346 = multiply(%345, 0.797885f); + %347 = tanh(%346); + %348 = multiply(%342, 0.5f); + %349 = add(%347, 1f); + %350 = multiply(%348, %349); + %351 = reshape(%350, newshape=[-1, 3072]); + %352 = nn.dense(%351, meta[relay.Constant][70], units=768); + %353 = add(%352, meta[relay.Constant][71]); + %354 = reshape(%353, newshape=[50, 32, 768]); + %355 = add(%329, %354); + %356 = mean(%355, axis=[-1], keepdims=True); + %357 = subtract(%355, %356); + %358 = power(%357, 2f); + %359 = mean(%358, axis=[-1], keepdims=True); + %360 = add(%359, 1e-05f); + %361 = sqrt(%360); + %362 = divide(%357, %361); + %363 = multiply(%362, meta[relay.Constant][72]); + %364 = add(%363, meta[relay.Constant][73]); + %365 = reshape(%364, newshape=[-1, 768]); + %366 = nn.dense(%365, meta[relay.Constant][74], units=2304); + %367 = add(%366, meta[relay.Constant][75]); + %368 = reshape(%367, newshape=[50, 32, 2304]); + %369 = split(%368, indices_or_sections=[768, 1536], axis=2); + %370 = %369.0; + %371 = reshape(%370, newshape=[50, 32, 12, 64]); + %372 = transpose(%371, 
axes=[0, 2, 1, 3]); + %373 = %369.1; + %374 = reshape(%373, newshape=[50, 32, 12, 64]); + %375 = transpose(%374, axes=[0, 2, 3, 1]); + %376 = reshape(%375, newshape=[-1, 64, 32]); + %377 = reshape(%372, newshape=[-1, 32, 64]); + %378 = transpose(%376, axes=[0, 2, 1]); + %379 = nn.batch_matmul(%377, %378, out_dtype="float32", transpose_b=True); + %380 = reshape(%379, newshape=[50, 12, 32, 32]); + %381 = divide(%380, 8f); + %382 = multiply(%381, meta[relay.Constant][76]); + %383 = subtract(%382, meta[relay.Constant][77]); + %384 = nn.softmax(%383, axis=3); + %385 = %369.2; + %386 = reshape(%385, newshape=[50, 32, 12, 64]); + %387 = transpose(%386, axes=[0, 2, 1, 3]); + %388 = reshape(%387, newshape=[-1, 32, 64]); + %389 = reshape(%384, newshape=[-1, 32, 32]); + %390 = transpose(%388, axes=[0, 2, 1]); + %391 = nn.batch_matmul(%389, %390, out_dtype="float32", transpose_b=True); + %392 = reshape(%391, newshape=[50, 12, 32, 64]); + %393 = transpose(%392, axes=[0, 2, 1, 3]); + %394 = reshape(%393, newshape=[50, 32, 768]); + %395 = reshape(%394, newshape=[-1, 768]); + %396 = nn.dense(%395, meta[relay.Constant][78], units=768); + %397 = add(%396, meta[relay.Constant][79]); + %398 = reshape(%397, newshape=[50, 32, 768]); + %399 = add(%355, %398); + %400 = mean(%399, axis=[-1], keepdims=True); + %401 = subtract(%399, %400); + %402 = power(%401, 2f); + %403 = mean(%402, axis=[-1], keepdims=True); + %404 = add(%403, 1e-05f); + %405 = sqrt(%404); + %406 = divide(%401, %405); + %407 = multiply(%406, meta[relay.Constant][80]); + %408 = add(%407, meta[relay.Constant][81]); + %409 = reshape(%408, newshape=[-1, 768]); + %410 = nn.dense(%409, meta[relay.Constant][82], units=3072); + %411 = add(%410, meta[relay.Constant][83]); + %412 = reshape(%411, newshape=[50, 32, 3072]); + %413 = power(%412, 3f); + %414 = multiply(%413, 0.044715f); + %415 = add(%412, %414); + %416 = multiply(%415, 0.797885f); + %417 = tanh(%416); + %418 = multiply(%412, 0.5f); + %419 = add(%417, 1f); + %420 = 
multiply(%418, %419); + %421 = reshape(%420, newshape=[-1, 3072]); + %422 = nn.dense(%421, meta[relay.Constant][84], units=768); + %423 = add(%422, meta[relay.Constant][85]); + %424 = reshape(%423, newshape=[50, 32, 768]); + %425 = add(%399, %424); + %426 = mean(%425, axis=[-1], keepdims=True); + %427 = subtract(%425, %426); + %428 = power(%427, 2f); + %429 = mean(%428, axis=[-1], keepdims=True); + %430 = add(%429, 1e-05f); + %431 = sqrt(%430); + %432 = divide(%427, %431); + %433 = multiply(%432, meta[relay.Constant][86]); + %434 = add(%433, meta[relay.Constant][87]); + %435 = reshape(%434, newshape=[-1, 768]); + %436 = nn.dense(%435, meta[relay.Constant][88], units=2304); + %437 = add(%436, meta[relay.Constant][89]); + %438 = reshape(%437, newshape=[50, 32, 2304]); + %439 = split(%438, indices_or_sections=[768, 1536], axis=2); + %440 = %439.0; + %441 = reshape(%440, newshape=[50, 32, 12, 64]); + %442 = transpose(%441, axes=[0, 2, 1, 3]); + %443 = %439.1; + %444 = reshape(%443, newshape=[50, 32, 12, 64]); + %445 = transpose(%444, axes=[0, 2, 3, 1]); + %446 = reshape(%445, newshape=[-1, 64, 32]); + %447 = reshape(%442, newshape=[-1, 32, 64]); + %448 = transpose(%446, axes=[0, 2, 1]); + %449 = nn.batch_matmul(%447, %448, out_dtype="float32", transpose_b=True); + %450 = reshape(%449, newshape=[50, 12, 32, 32]); + %451 = divide(%450, 8f); + %452 = multiply(%451, meta[relay.Constant][90]); + %453 = subtract(%452, meta[relay.Constant][91]); + %454 = nn.softmax(%453, axis=3); + %455 = %439.2; + %456 = reshape(%455, newshape=[50, 32, 12, 64]); + %457 = transpose(%456, axes=[0, 2, 1, 3]); + %458 = reshape(%457, newshape=[-1, 32, 64]); + %459 = reshape(%454, newshape=[-1, 32, 32]); + %460 = transpose(%458, axes=[0, 2, 1]); + %461 = nn.batch_matmul(%459, %460, out_dtype="float32", transpose_b=True); + %462 = reshape(%461, newshape=[50, 12, 32, 64]); + %463 = transpose(%462, axes=[0, 2, 1, 3]); + %464 = reshape(%463, newshape=[50, 32, 768]); + %465 = reshape(%464, 
newshape=[-1, 768]); + %466 = nn.dense(%465, meta[relay.Constant][92], units=768); + %467 = add(%466, meta[relay.Constant][93]); + %468 = reshape(%467, newshape=[50, 32, 768]); + %469 = add(%425, %468); + %470 = mean(%469, axis=[-1], keepdims=True); + %471 = subtract(%469, %470); + %472 = power(%471, 2f); + %473 = mean(%472, axis=[-1], keepdims=True); + %474 = add(%473, 1e-05f); + %475 = sqrt(%474); + %476 = divide(%471, %475); + %477 = multiply(%476, meta[relay.Constant][94]); + %478 = add(%477, meta[relay.Constant][95]); + %479 = reshape(%478, newshape=[-1, 768]); + %480 = nn.dense(%479, meta[relay.Constant][96], units=3072); + %481 = add(%480, meta[relay.Constant][97]); + %482 = reshape(%481, newshape=[50, 32, 3072]); + %483 = power(%482, 3f); + %484 = multiply(%483, 0.044715f); + %485 = add(%482, %484); + %486 = multiply(%485, 0.797885f); + %487 = tanh(%486); + %488 = multiply(%482, 0.5f); + %489 = add(%487, 1f); + %490 = multiply(%488, %489); + %491 = reshape(%490, newshape=[-1, 3072]); + %492 = nn.dense(%491, meta[relay.Constant][98], units=768); + %493 = add(%492, meta[relay.Constant][99]); + %494 = reshape(%493, newshape=[50, 32, 768]); + %495 = add(%469, %494); + %496 = mean(%495, axis=[-1], keepdims=True); + %497 = subtract(%495, %496); + %498 = power(%497, 2f); + %499 = mean(%498, axis=[-1], keepdims=True); + %500 = add(%499, 1e-05f); + %501 = sqrt(%500); + %502 = divide(%497, %501); + %503 = multiply(%502, meta[relay.Constant][100]); + %504 = add(%503, meta[relay.Constant][101]); + %505 = reshape(%504, newshape=[-1, 768]); + %506 = nn.dense(%505, meta[relay.Constant][102], units=2304); + %507 = add(%506, meta[relay.Constant][103]); + %508 = reshape(%507, newshape=[50, 32, 2304]); + %509 = split(%508, indices_or_sections=[768, 1536], axis=2); + %510 = %509.0; + %511 = reshape(%510, newshape=[50, 32, 12, 64]); + %512 = transpose(%511, axes=[0, 2, 1, 3]); + %513 = %509.1; + %514 = reshape(%513, newshape=[50, 32, 12, 64]); + %515 = transpose(%514, axes=[0, 
2, 3, 1]); + %516 = reshape(%515, newshape=[-1, 64, 32]); + %517 = reshape(%512, newshape=[-1, 32, 64]); + %518 = transpose(%516, axes=[0, 2, 1]); + %519 = nn.batch_matmul(%517, %518, out_dtype="float32", transpose_b=True); + %520 = reshape(%519, newshape=[50, 12, 32, 32]); + %521 = divide(%520, 8f); + %522 = multiply(%521, meta[relay.Constant][104]); + %523 = subtract(%522, meta[relay.Constant][105]); + %524 = nn.softmax(%523, axis=3); + %525 = %509.2; + %526 = reshape(%525, newshape=[50, 32, 12, 64]); + %527 = transpose(%526, axes=[0, 2, 1, 3]); + %528 = reshape(%527, newshape=[-1, 32, 64]); + %529 = reshape(%524, newshape=[-1, 32, 32]); + %530 = transpose(%528, axes=[0, 2, 1]); + %531 = nn.batch_matmul(%529, %530, out_dtype="float32", transpose_b=True); + %532 = reshape(%531, newshape=[50, 12, 32, 64]); + %533 = transpose(%532, axes=[0, 2, 1, 3]); + %534 = reshape(%533, newshape=[50, 32, 768]); + %535 = reshape(%534, newshape=[-1, 768]); + %536 = nn.dense(%535, meta[relay.Constant][106], units=768); + %537 = add(%536, meta[relay.Constant][107]); + %538 = reshape(%537, newshape=[50, 32, 768]); + %539 = add(%495, %538); + %540 = mean(%539, axis=[-1], keepdims=True); + %541 = subtract(%539, %540); + %542 = power(%541, 2f); + %543 = mean(%542, axis=[-1], keepdims=True); + %544 = add(%543, 1e-05f); + %545 = sqrt(%544); + %546 = divide(%541, %545); + %547 = multiply(%546, meta[relay.Constant][108]); + %548 = add(%547, meta[relay.Constant][109]); + %549 = reshape(%548, newshape=[-1, 768]); + %550 = nn.dense(%549, meta[relay.Constant][110], units=3072); + %551 = add(%550, meta[relay.Constant][111]); + %552 = reshape(%551, newshape=[50, 32, 3072]); + %553 = power(%552, 3f); + %554 = multiply(%553, 0.044715f); + %555 = add(%552, %554); + %556 = multiply(%555, 0.797885f); + %557 = tanh(%556); + %558 = multiply(%552, 0.5f); + %559 = add(%557, 1f); + %560 = multiply(%558, %559); + %561 = reshape(%560, newshape=[-1, 3072]); + %562 = nn.dense(%561, meta[relay.Constant][112], 
units=768); + %563 = add(%562, meta[relay.Constant][113]); + %564 = reshape(%563, newshape=[50, 32, 768]); + %565 = add(%539, %564); + %566 = mean(%565, axis=[-1], keepdims=True); + %567 = subtract(%565, %566); + %568 = power(%567, 2f); + %569 = mean(%568, axis=[-1], keepdims=True); + %570 = add(%569, 1e-05f); + %571 = sqrt(%570); + %572 = divide(%567, %571); + %573 = multiply(%572, meta[relay.Constant][114]); + %574 = add(%573, meta[relay.Constant][115]); + %575 = reshape(%574, newshape=[-1, 768]); + %576 = nn.dense(%575, meta[relay.Constant][116], units=2304); + %577 = add(%576, meta[relay.Constant][117]); + %578 = reshape(%577, newshape=[50, 32, 2304]); + %579 = split(%578, indices_or_sections=[768, 1536], axis=2); + %580 = %579.0; + %581 = reshape(%580, newshape=[50, 32, 12, 64]); + %582 = transpose(%581, axes=[0, 2, 1, 3]); + %583 = %579.1; + %584 = reshape(%583, newshape=[50, 32, 12, 64]); + %585 = transpose(%584, axes=[0, 2, 3, 1]); + %586 = reshape(%585, newshape=[-1, 64, 32]); + %587 = reshape(%582, newshape=[-1, 32, 64]); + %588 = transpose(%586, axes=[0, 2, 1]); + %589 = nn.batch_matmul(%587, %588, out_dtype="float32", transpose_b=True); + %590 = reshape(%589, newshape=[50, 12, 32, 32]); + %591 = divide(%590, 8f); + %592 = multiply(%591, meta[relay.Constant][118]); + %593 = subtract(%592, meta[relay.Constant][119]); + %594 = nn.softmax(%593, axis=3); + %595 = %579.2; + %596 = reshape(%595, newshape=[50, 32, 12, 64]); + %597 = transpose(%596, axes=[0, 2, 1, 3]); + %598 = reshape(%597, newshape=[-1, 32, 64]); + %599 = reshape(%594, newshape=[-1, 32, 32]); + %600 = transpose(%598, axes=[0, 2, 1]); + %601 = nn.batch_matmul(%599, %600, out_dtype="float32", transpose_b=True); + %602 = reshape(%601, newshape=[50, 12, 32, 64]); + %603 = transpose(%602, axes=[0, 2, 1, 3]); + %604 = reshape(%603, newshape=[50, 32, 768]); + %605 = reshape(%604, newshape=[-1, 768]); + %606 = nn.dense(%605, meta[relay.Constant][120], units=768); + %607 = add(%606, 
meta[relay.Constant][121]); + %608 = reshape(%607, newshape=[50, 32, 768]); + %609 = add(%565, %608); + %610 = mean(%609, axis=[-1], keepdims=True); + %611 = subtract(%609, %610); + %612 = power(%611, 2f); + %613 = mean(%612, axis=[-1], keepdims=True); + %614 = add(%613, 1e-05f); + %615 = sqrt(%614); + %616 = divide(%611, %615); + %617 = multiply(%616, meta[relay.Constant][122]); + %618 = add(%617, meta[relay.Constant][123]); + %619 = reshape(%618, newshape=[-1, 768]); + %620 = nn.dense(%619, meta[relay.Constant][124], units=3072); + %621 = add(%620, meta[relay.Constant][125]); + %622 = reshape(%621, newshape=[50, 32, 3072]); + %623 = power(%622, 3f); + %624 = multiply(%623, 0.044715f); + %625 = add(%622, %624); + %626 = multiply(%625, 0.797885f); + %627 = tanh(%626); + %628 = multiply(%622, 0.5f); + %629 = add(%627, 1f); + %630 = multiply(%628, %629); + %631 = reshape(%630, newshape=[-1, 3072]); + %632 = nn.dense(%631, meta[relay.Constant][126], units=768); + %633 = add(%632, meta[relay.Constant][127]); + %634 = reshape(%633, newshape=[50, 32, 768]); + %635 = add(%609, %634); + %636 = mean(%635, axis=[-1], keepdims=True); + %637 = subtract(%635, %636); + %638 = power(%637, 2f); + %639 = mean(%638, axis=[-1], keepdims=True); + %640 = add(%639, 1e-05f); + %641 = sqrt(%640); + %642 = divide(%637, %641); + %643 = multiply(%642, meta[relay.Constant][128]); + %644 = add(%643, meta[relay.Constant][129]); + %645 = reshape(%644, newshape=[-1, 768]); + %646 = nn.dense(%645, meta[relay.Constant][130], units=2304); + %647 = add(%646, meta[relay.Constant][131]); + %648 = reshape(%647, newshape=[50, 32, 2304]); + %649 = split(%648, indices_or_sections=[768, 1536], axis=2); + %650 = %649.0; + %651 = reshape(%650, newshape=[50, 32, 12, 64]); + %652 = transpose(%651, axes=[0, 2, 1, 3]); + %653 = %649.1; + %654 = reshape(%653, newshape=[50, 32, 12, 64]); + %655 = transpose(%654, axes=[0, 2, 3, 1]); + %656 = reshape(%655, newshape=[-1, 64, 32]); + %657 = reshape(%652, newshape=[-1, 
32, 64]); + %658 = transpose(%656, axes=[0, 2, 1]); + %659 = nn.batch_matmul(%657, %658, out_dtype="float32", transpose_b=True); + %660 = reshape(%659, newshape=[50, 12, 32, 32]); + %661 = divide(%660, 8f); + %662 = multiply(%661, meta[relay.Constant][132]); + %663 = subtract(%662, meta[relay.Constant][133]); + %664 = nn.softmax(%663, axis=3); + %665 = %649.2; + %666 = reshape(%665, newshape=[50, 32, 12, 64]); + %667 = transpose(%666, axes=[0, 2, 1, 3]); + %668 = reshape(%667, newshape=[-1, 32, 64]); + %669 = reshape(%664, newshape=[-1, 32, 32]); + %670 = transpose(%668, axes=[0, 2, 1]); + %671 = nn.batch_matmul(%669, %670, out_dtype="float32", transpose_b=True); + %672 = reshape(%671, newshape=[50, 12, 32, 64]); + %673 = transpose(%672, axes=[0, 2, 1, 3]); + %674 = reshape(%673, newshape=[50, 32, 768]); + %675 = reshape(%674, newshape=[-1, 768]); + %676 = nn.dense(%675, meta[relay.Constant][134], units=768); + %677 = add(%676, meta[relay.Constant][135]); + %678 = reshape(%677, newshape=[50, 32, 768]); + %679 = add(%635, %678); + %680 = mean(%679, axis=[-1], keepdims=True); + %681 = subtract(%679, %680); + %682 = power(%681, 2f); + %683 = mean(%682, axis=[-1], keepdims=True); + %684 = add(%683, 1e-05f); + %685 = sqrt(%684); + %686 = divide(%681, %685); + %687 = multiply(%686, meta[relay.Constant][136]); + %688 = add(%687, meta[relay.Constant][137]); + %689 = reshape(%688, newshape=[-1, 768]); + %690 = nn.dense(%689, meta[relay.Constant][138], units=3072); + %691 = add(%690, meta[relay.Constant][139]); + %692 = reshape(%691, newshape=[50, 32, 3072]); + %693 = power(%692, 3f); + %694 = multiply(%693, 0.044715f); + %695 = add(%692, %694); + %696 = multiply(%695, 0.797885f); + %697 = tanh(%696); + %698 = multiply(%692, 0.5f); + %699 = add(%697, 1f); + %700 = multiply(%698, %699); + %701 = reshape(%700, newshape=[-1, 3072]); + %702 = nn.dense(%701, meta[relay.Constant][140], units=768); + %703 = add(%702, meta[relay.Constant][141]); + %704 = reshape(%703, newshape=[50, 
32, 768]); + %705 = add(%679, %704); + %706 = mean(%705, axis=[-1], keepdims=True); + %707 = subtract(%705, %706); + %708 = power(%707, 2f); + %709 = mean(%708, axis=[-1], keepdims=True); + %710 = add(%709, 1e-05f); + %711 = sqrt(%710); + %712 = divide(%707, %711); + %713 = multiply(%712, meta[relay.Constant][142]); + %714 = add(%713, meta[relay.Constant][143]); + %715 = reshape(%714, newshape=[-1, 768]); + %716 = nn.dense(%715, meta[relay.Constant][144], units=2304); + %717 = add(%716, meta[relay.Constant][145]); + %718 = reshape(%717, newshape=[50, 32, 2304]); + %719 = split(%718, indices_or_sections=[768, 1536], axis=2); + %720 = %719.0; + %721 = reshape(%720, newshape=[50, 32, 12, 64]); + %722 = transpose(%721, axes=[0, 2, 1, 3]); + %723 = %719.1; + %724 = reshape(%723, newshape=[50, 32, 12, 64]); + %725 = transpose(%724, axes=[0, 2, 3, 1]); + %726 = reshape(%725, newshape=[-1, 64, 32]); + %727 = reshape(%722, newshape=[-1, 32, 64]); + %728 = transpose(%726, axes=[0, 2, 1]); + %729 = nn.batch_matmul(%727, %728, out_dtype="float32", transpose_b=True); + %730 = reshape(%729, newshape=[50, 12, 32, 32]); + %731 = divide(%730, 8f); + %732 = multiply(%731, meta[relay.Constant][146]); + %733 = subtract(%732, meta[relay.Constant][147]); + %734 = nn.softmax(%733, axis=3); + %735 = %719.2; + %736 = reshape(%735, newshape=[50, 32, 12, 64]); + %737 = transpose(%736, axes=[0, 2, 1, 3]); + %738 = reshape(%737, newshape=[-1, 32, 64]); + %739 = reshape(%734, newshape=[-1, 32, 32]); + %740 = transpose(%738, axes=[0, 2, 1]); + %741 = nn.batch_matmul(%739, %740, out_dtype="float32", transpose_b=True); + %742 = reshape(%741, newshape=[50, 12, 32, 64]); + %743 = transpose(%742, axes=[0, 2, 1, 3]); + %744 = reshape(%743, newshape=[50, 32, 768]); + %745 = reshape(%744, newshape=[-1, 768]); + %746 = nn.dense(%745, meta[relay.Constant][148], units=768); + %747 = add(%746, meta[relay.Constant][149]); + %748 = reshape(%747, newshape=[50, 32, 768]); + %749 = add(%705, %748); + %750 = 
mean(%749, axis=[-1], keepdims=True); + %751 = subtract(%749, %750); + %752 = power(%751, 2f); + %753 = mean(%752, axis=[-1], keepdims=True); + %754 = add(%753, 1e-05f); + %755 = sqrt(%754); + %756 = divide(%751, %755); + %757 = multiply(%756, meta[relay.Constant][150]); + %758 = add(%757, meta[relay.Constant][151]); + %759 = reshape(%758, newshape=[-1, 768]); + %760 = nn.dense(%759, meta[relay.Constant][152], units=3072); + %761 = add(%760, meta[relay.Constant][153]); + %762 = reshape(%761, newshape=[50, 32, 3072]); + %763 = power(%762, 3f); + %764 = multiply(%763, 0.044715f); + %765 = add(%762, %764); + %766 = multiply(%765, 0.797885f); + %767 = tanh(%766); + %768 = multiply(%762, 0.5f); + %769 = add(%767, 1f); + %770 = multiply(%768, %769); + %771 = reshape(%770, newshape=[-1, 3072]); + %772 = nn.dense(%771, meta[relay.Constant][154], units=768); + %773 = add(%772, meta[relay.Constant][155]); + %774 = reshape(%773, newshape=[50, 32, 768]); + %775 = add(%749, %774); + %776 = mean(%775, axis=[-1], keepdims=True); + %777 = subtract(%775, %776); + %778 = power(%777, 2f); + %779 = mean(%778, axis=[-1], keepdims=True); + %780 = add(%779, 1e-05f); + %781 = sqrt(%780); + %782 = divide(%777, %781); + %783 = multiply(%782, meta[relay.Constant][156]); + %784 = add(%783, meta[relay.Constant][157]); + %785 = reshape(%784, newshape=[-1, 768]); + %786 = nn.dense(%785, meta[relay.Constant][158], units=2304); + %787 = add(%786, meta[relay.Constant][159]); + %788 = reshape(%787, newshape=[50, 32, 2304]); + %789 = split(%788, indices_or_sections=[768, 1536], axis=2); + %790 = %789.0; + %791 = reshape(%790, newshape=[50, 32, 12, 64]); + %792 = transpose(%791, axes=[0, 2, 1, 3]); + %793 = %789.1; + %794 = reshape(%793, newshape=[50, 32, 12, 64]); + %795 = transpose(%794, axes=[0, 2, 3, 1]); + %796 = reshape(%795, newshape=[-1, 64, 32]); + %797 = reshape(%792, newshape=[-1, 32, 64]); + %798 = transpose(%796, axes=[0, 2, 1]); + %799 = nn.batch_matmul(%797, %798, out_dtype="float32", 
transpose_b=True); + %800 = reshape(%799, newshape=[50, 12, 32, 32]); + %801 = divide(%800, 8f); + %802 = multiply(%801, meta[relay.Constant][160]); + %803 = subtract(%802, meta[relay.Constant][161]); + %804 = nn.softmax(%803, axis=3); + %805 = %789.2; + %806 = reshape(%805, newshape=[50, 32, 12, 64]); + %807 = transpose(%806, axes=[0, 2, 1, 3]); + %808 = reshape(%807, newshape=[-1, 32, 64]); + %809 = reshape(%804, newshape=[-1, 32, 32]); + %810 = transpose(%808, axes=[0, 2, 1]); + %811 = nn.batch_matmul(%809, %810, out_dtype="float32", transpose_b=True); + %812 = reshape(%811, newshape=[50, 12, 32, 64]); + %813 = transpose(%812, axes=[0, 2, 1, 3]); + %814 = reshape(%813, newshape=[50, 32, 768]); + %815 = reshape(%814, newshape=[-1, 768]); + %816 = nn.dense(%815, meta[relay.Constant][162], units=768); + %817 = add(%816, meta[relay.Constant][163]); + %818 = reshape(%817, newshape=[50, 32, 768]); + %819 = add(%775, %818); + %820 = mean(%819, axis=[-1], keepdims=True); + %821 = subtract(%819, %820); + %822 = power(%821, 2f); + %823 = mean(%822, axis=[-1], keepdims=True); + %824 = add(%823, 1e-05f); + %825 = sqrt(%824); + %826 = divide(%821, %825); + %827 = multiply(%826, meta[relay.Constant][164]); + %828 = add(%827, meta[relay.Constant][165]); + %829 = reshape(%828, newshape=[-1, 768]); + %830 = nn.dense(%829, meta[relay.Constant][166], units=3072); + %831 = add(%830, meta[relay.Constant][167]); + %832 = reshape(%831, newshape=[50, 32, 3072]); + %833 = power(%832, 3f); + %834 = multiply(%833, 0.044715f); + %835 = add(%832, %834); + %836 = multiply(%835, 0.797885f); + %837 = tanh(%836); + %838 = multiply(%832, 0.5f); + %839 = add(%837, 1f); + %840 = multiply(%838, %839); + %841 = reshape(%840, newshape=[-1, 3072]); + %842 = nn.dense(%841, meta[relay.Constant][168], units=768); + %843 = add(%842, meta[relay.Constant][169]); + %844 = reshape(%843, newshape=[50, 32, 768]); + %845 = add(%819, %844); + %846 = mean(%845, axis=[-1], keepdims=True); + %847 = subtract(%845, 
%846); + %848 = power(%847, 2f); + %849 = mean(%848, axis=[-1], keepdims=True); + %850 = add(%849, 1e-05f); + %851 = sqrt(%850); + %852 = divide(%847, %851); + %853 = multiply(%852, meta[relay.Constant][170]); + %854 = add(%853, meta[relay.Constant][171]); + %855 = transpose(%24, axes=[0, 2, 1, 3]); + %856 = expand_dims(%855, axis=0); + %857 = expand_dims(%37, axis=0); + %858 = (%856, %857); + %859 = transpose(%94, axes=[0, 2, 1, 3]); + %860 = expand_dims(%859, axis=0); + %861 = expand_dims(%107, axis=0); + %862 = (%860, %861); + %863 = transpose(%164, axes=[0, 2, 1, 3]); + %864 = expand_dims(%863, axis=0); + %865 = expand_dims(%177, axis=0); + %866 = (%864, %865); + %867 = transpose(%234, axes=[0, 2, 1, 3]); + %868 = expand_dims(%867, axis=0); + %869 = expand_dims(%247, axis=0); + %870 = (%868, %869); + %871 = transpose(%304, axes=[0, 2, 1, 3]); + %872 = expand_dims(%871, axis=0); + %873 = expand_dims(%317, axis=0); + %874 = (%872, %873); + %875 = transpose(%374, axes=[0, 2, 1, 3]); + %876 = expand_dims(%875, axis=0); + %877 = expand_dims(%387, axis=0); + %878 = (%876, %877); + %879 = transpose(%444, axes=[0, 2, 1, 3]); + %880 = expand_dims(%879, axis=0); + %881 = expand_dims(%457, axis=0); + %882 = (%880, %881); + %883 = transpose(%514, axes=[0, 2, 1, 3]); + %884 = expand_dims(%883, axis=0); + %885 = expand_dims(%527, axis=0); + %886 = (%884, %885); + %887 = transpose(%584, axes=[0, 2, 1, 3]); + %888 = expand_dims(%887, axis=0); + %889 = expand_dims(%597, axis=0); + %890 = (%888, %889); + %891 = transpose(%654, axes=[0, 2, 1, 3]); + %892 = expand_dims(%891, axis=0); + %893 = expand_dims(%667, axis=0); + %894 = (%892, %893); + %895 = transpose(%724, axes=[0, 2, 1, 3]); + %896 = expand_dims(%895, axis=0); + %897 = expand_dims(%737, axis=0); + %898 = (%896, %897); + %899 = transpose(%794, axes=[0, 2, 1, 3]); + %900 = expand_dims(%899, axis=0); + %901 = expand_dims(%807, axis=0); + %902 = (%900, %901); + %903 = reshape(%854, newshape=[1, 50, 32, 768]); + %904 = 
concatenate(%858); + %905 = concatenate(%862); + %906 = concatenate(%866); + %907 = concatenate(%870); + %908 = concatenate(%874); + %909 = concatenate(%878); + %910 = concatenate(%882); + %911 = concatenate(%886); + %912 = concatenate(%890); + %913 = concatenate(%894); + %914 = concatenate(%898); + %915 = concatenate(%902); + (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2", + "input_shapes": {"x": [1, 50, 32]}, + "input_dtypes": {"x": "int64"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_16(): + metatable = {"relay.Constant": gpt2_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16]) { + %0 = reshape(%x, newshape=[-1, 32]); + %1 = less(%0, 0i64); + %2 = add(%0, 50257i64); + %3 = where(%1, %2, %0); + %4 = take(meta[relay.Constant][0], %3, axis=0); + %5 = add(%4, meta[relay.Constant][1]); + %6 = mean(%5, axis=[-1], keepdims=True); + %7 = subtract(%5, %6); + %8 = power(%7, 2f16); + %9 = mean(%8, axis=[-1], keepdims=True); + %10 = add(%9, 1e-05f16); + %11 = sqrt(%10); + %12 = divide(%7, %11); + %13 = multiply(%12, meta[relay.Constant][2]); + %14 = add(%13, meta[relay.Constant][3]); + %15 = reshape(%14, newshape=[-1, 768]); + %16 = nn.dense(%15, meta[relay.Constant][4], units=2304); + %17 = add(%16, meta[relay.Constant][5]); + %18 = reshape(%17, 
newshape=[50, 32, 2304]); + %19 = split(%18, indices_or_sections=[768, 1536], axis=2); + %20 = %19.0; + %21 = reshape(%20, newshape=[50, 32, 12, 64]); + %22 = transpose(%21, axes=[0, 2, 1, 3]); + %23 = %19.1; + %24 = reshape(%23, newshape=[50, 32, 12, 64]); + %25 = transpose(%24, axes=[0, 2, 3, 1]); + %26 = reshape(%25, newshape=[-1, 64, 32]); + %27 = reshape(%22, newshape=[-1, 32, 64]); + %28 = transpose(%26, axes=[0, 2, 1]); + %29 = nn.batch_matmul(%27, %28, out_dtype="float16", transpose_b=True); + %30 = reshape(%29, newshape=[50, 12, 32, 32]); + %31 = divide(%30, 8f16); + %32 = multiply(%31, meta[relay.Constant][6]); + %33 = subtract(%32, meta[relay.Constant][7]); + %34 = nn.softmax(%33, axis=3); + %35 = %19.2; + %36 = reshape(%35, newshape=[50, 32, 12, 64]); + %37 = transpose(%36, axes=[0, 2, 1, 3]); + %38 = reshape(%37, newshape=[-1, 32, 64]); + %39 = reshape(%34, newshape=[-1, 32, 32]); + %40 = transpose(%38, axes=[0, 2, 1]); + %41 = nn.batch_matmul(%39, %40, out_dtype="float16", transpose_b=True); + %42 = reshape(%41, newshape=[50, 12, 32, 64]); + %43 = transpose(%42, axes=[0, 2, 1, 3]); + %44 = reshape(%43, newshape=[50, 32, 768]); + %45 = reshape(%44, newshape=[-1, 768]); + %46 = nn.dense(%45, meta[relay.Constant][8], units=768); + %47 = add(%46, meta[relay.Constant][9]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(%5, %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f16); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f16); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][10]); + %58 = add(%57, meta[relay.Constant][11]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][12], units=3072); + %61 = add(%60, meta[relay.Constant][13]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f16); + %64 = multiply(%63, 0.044715f16); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f16); + 
%67 = tanh(%66); + %68 = multiply(%62, 0.5f16); + %69 = add(%67, 1f16); + %70 = multiply(%68, %69); + %71 = reshape(%70, newshape=[-1, 3072]); + %72 = nn.dense(%71, meta[relay.Constant][14], units=768); + %73 = add(%72, meta[relay.Constant][15]); + %74 = reshape(%73, newshape=[50, 32, 768]); + %75 = add(%49, %74); + %76 = mean(%75, axis=[-1], keepdims=True); + %77 = subtract(%75, %76); + %78 = power(%77, 2f16); + %79 = mean(%78, axis=[-1], keepdims=True); + %80 = add(%79, 1e-05f16); + %81 = sqrt(%80); + %82 = divide(%77, %81); + %83 = multiply(%82, meta[relay.Constant][16]); + %84 = add(%83, meta[relay.Constant][17]); + %85 = reshape(%84, newshape=[-1, 768]); + %86 = nn.dense(%85, meta[relay.Constant][18], units=2304); + %87 = add(%86, meta[relay.Constant][19]); + %88 = reshape(%87, newshape=[50, 32, 2304]); + %89 = split(%88, indices_or_sections=[768, 1536], axis=2); + %90 = %89.0; + %91 = reshape(%90, newshape=[50, 32, 12, 64]); + %92 = transpose(%91, axes=[0, 2, 1, 3]); + %93 = %89.1; + %94 = reshape(%93, newshape=[50, 32, 12, 64]); + %95 = transpose(%94, axes=[0, 2, 3, 1]); + %96 = reshape(%95, newshape=[-1, 64, 32]); + %97 = reshape(%92, newshape=[-1, 32, 64]); + %98 = transpose(%96, axes=[0, 2, 1]); + %99 = nn.batch_matmul(%97, %98, out_dtype="float16", transpose_b=True); + %100 = reshape(%99, newshape=[50, 12, 32, 32]); + %101 = divide(%100, 8f16); + %102 = multiply(%101, meta[relay.Constant][20]); + %103 = subtract(%102, meta[relay.Constant][21]); + %104 = nn.softmax(%103, axis=3); + %105 = %89.2; + %106 = reshape(%105, newshape=[50, 32, 12, 64]); + %107 = transpose(%106, axes=[0, 2, 1, 3]); + %108 = reshape(%107, newshape=[-1, 32, 64]); + %109 = reshape(%104, newshape=[-1, 32, 32]); + %110 = transpose(%108, axes=[0, 2, 1]); + %111 = nn.batch_matmul(%109, %110, out_dtype="float16", transpose_b=True); + %112 = reshape(%111, newshape=[50, 12, 32, 64]); + %113 = transpose(%112, axes=[0, 2, 1, 3]); + %114 = reshape(%113, newshape=[50, 32, 768]); + %115 = 
reshape(%114, newshape=[-1, 768]); + %116 = nn.dense(%115, meta[relay.Constant][22], units=768); + %117 = add(%116, meta[relay.Constant][23]); + %118 = reshape(%117, newshape=[50, 32, 768]); + %119 = add(%75, %118); + %120 = mean(%119, axis=[-1], keepdims=True); + %121 = subtract(%119, %120); + %122 = power(%121, 2f16); + %123 = mean(%122, axis=[-1], keepdims=True); + %124 = add(%123, 1e-05f16); + %125 = sqrt(%124); + %126 = divide(%121, %125); + %127 = multiply(%126, meta[relay.Constant][24]); + %128 = add(%127, meta[relay.Constant][25]); + %129 = reshape(%128, newshape=[-1, 768]); + %130 = nn.dense(%129, meta[relay.Constant][26], units=3072); + %131 = add(%130, meta[relay.Constant][27]); + %132 = reshape(%131, newshape=[50, 32, 3072]); + %133 = power(%132, 3f16); + %134 = multiply(%133, 0.044715f16); + %135 = add(%132, %134); + %136 = multiply(%135, 0.797885f16); + %137 = tanh(%136); + %138 = multiply(%132, 0.5f16); + %139 = add(%137, 1f16); + %140 = multiply(%138, %139); + %141 = reshape(%140, newshape=[-1, 3072]); + %142 = nn.dense(%141, meta[relay.Constant][28], units=768); + %143 = add(%142, meta[relay.Constant][29]); + %144 = reshape(%143, newshape=[50, 32, 768]); + %145 = add(%119, %144); + %146 = mean(%145, axis=[-1], keepdims=True); + %147 = subtract(%145, %146); + %148 = power(%147, 2f16); + %149 = mean(%148, axis=[-1], keepdims=True); + %150 = add(%149, 1e-05f16); + %151 = sqrt(%150); + %152 = divide(%147, %151); + %153 = multiply(%152, meta[relay.Constant][30]); + %154 = add(%153, meta[relay.Constant][31]); + %155 = reshape(%154, newshape=[-1, 768]); + %156 = nn.dense(%155, meta[relay.Constant][32], units=2304); + %157 = add(%156, meta[relay.Constant][33]); + %158 = reshape(%157, newshape=[50, 32, 2304]); + %159 = split(%158, indices_or_sections=[768, 1536], axis=2); + %160 = %159.0; + %161 = reshape(%160, newshape=[50, 32, 12, 64]); + %162 = transpose(%161, axes=[0, 2, 1, 3]); + %163 = %159.1; + %164 = reshape(%163, newshape=[50, 32, 12, 64]); + %165 
= transpose(%164, axes=[0, 2, 3, 1]); + %166 = reshape(%165, newshape=[-1, 64, 32]); + %167 = reshape(%162, newshape=[-1, 32, 64]); + %168 = transpose(%166, axes=[0, 2, 1]); + %169 = nn.batch_matmul(%167, %168, out_dtype="float16", transpose_b=True); + %170 = reshape(%169, newshape=[50, 12, 32, 32]); + %171 = divide(%170, 8f16); + %172 = multiply(%171, meta[relay.Constant][34]); + %173 = subtract(%172, meta[relay.Constant][35]); + %174 = nn.softmax(%173, axis=3); + %175 = %159.2; + %176 = reshape(%175, newshape=[50, 32, 12, 64]); + %177 = transpose(%176, axes=[0, 2, 1, 3]); + %178 = reshape(%177, newshape=[-1, 32, 64]); + %179 = reshape(%174, newshape=[-1, 32, 32]); + %180 = transpose(%178, axes=[0, 2, 1]); + %181 = nn.batch_matmul(%179, %180, out_dtype="float16", transpose_b=True); + %182 = reshape(%181, newshape=[50, 12, 32, 64]); + %183 = transpose(%182, axes=[0, 2, 1, 3]); + %184 = reshape(%183, newshape=[50, 32, 768]); + %185 = reshape(%184, newshape=[-1, 768]); + %186 = nn.dense(%185, meta[relay.Constant][36], units=768); + %187 = add(%186, meta[relay.Constant][37]); + %188 = reshape(%187, newshape=[50, 32, 768]); + %189 = add(%145, %188); + %190 = mean(%189, axis=[-1], keepdims=True); + %191 = subtract(%189, %190); + %192 = power(%191, 2f16); + %193 = mean(%192, axis=[-1], keepdims=True); + %194 = add(%193, 1e-05f16); + %195 = sqrt(%194); + %196 = divide(%191, %195); + %197 = multiply(%196, meta[relay.Constant][38]); + %198 = add(%197, meta[relay.Constant][39]); + %199 = reshape(%198, newshape=[-1, 768]); + %200 = nn.dense(%199, meta[relay.Constant][40], units=3072); + %201 = add(%200, meta[relay.Constant][41]); + %202 = reshape(%201, newshape=[50, 32, 3072]); + %203 = power(%202, 3f16); + %204 = multiply(%203, 0.044715f16); + %205 = add(%202, %204); + %206 = multiply(%205, 0.797885f16); + %207 = tanh(%206); + %208 = multiply(%202, 0.5f16); + %209 = add(%207, 1f16); + %210 = multiply(%208, %209); + %211 = reshape(%210, newshape=[-1, 3072]); + %212 = 
nn.dense(%211, meta[relay.Constant][42], units=768); + %213 = add(%212, meta[relay.Constant][43]); + %214 = reshape(%213, newshape=[50, 32, 768]); + %215 = add(%189, %214); + %216 = mean(%215, axis=[-1], keepdims=True); + %217 = subtract(%215, %216); + %218 = power(%217, 2f16); + %219 = mean(%218, axis=[-1], keepdims=True); + %220 = add(%219, 1e-05f16); + %221 = sqrt(%220); + %222 = divide(%217, %221); + %223 = multiply(%222, meta[relay.Constant][44]); + %224 = add(%223, meta[relay.Constant][45]); + %225 = reshape(%224, newshape=[-1, 768]); + %226 = nn.dense(%225, meta[relay.Constant][46], units=2304); + %227 = add(%226, meta[relay.Constant][47]); + %228 = reshape(%227, newshape=[50, 32, 2304]); + %229 = split(%228, indices_or_sections=[768, 1536], axis=2); + %230 = %229.0; + %231 = reshape(%230, newshape=[50, 32, 12, 64]); + %232 = transpose(%231, axes=[0, 2, 1, 3]); + %233 = %229.1; + %234 = reshape(%233, newshape=[50, 32, 12, 64]); + %235 = transpose(%234, axes=[0, 2, 3, 1]); + %236 = reshape(%235, newshape=[-1, 64, 32]); + %237 = reshape(%232, newshape=[-1, 32, 64]); + %238 = transpose(%236, axes=[0, 2, 1]); + %239 = nn.batch_matmul(%237, %238, out_dtype="float16", transpose_b=True); + %240 = reshape(%239, newshape=[50, 12, 32, 32]); + %241 = divide(%240, 8f16); + %242 = multiply(%241, meta[relay.Constant][48]); + %243 = subtract(%242, meta[relay.Constant][49]); + %244 = nn.softmax(%243, axis=3); + %245 = %229.2; + %246 = reshape(%245, newshape=[50, 32, 12, 64]); + %247 = transpose(%246, axes=[0, 2, 1, 3]); + %248 = reshape(%247, newshape=[-1, 32, 64]); + %249 = reshape(%244, newshape=[-1, 32, 32]); + %250 = transpose(%248, axes=[0, 2, 1]); + %251 = nn.batch_matmul(%249, %250, out_dtype="float16", transpose_b=True); + %252 = reshape(%251, newshape=[50, 12, 32, 64]); + %253 = transpose(%252, axes=[0, 2, 1, 3]); + %254 = reshape(%253, newshape=[50, 32, 768]); + %255 = reshape(%254, newshape=[-1, 768]); + %256 = nn.dense(%255, meta[relay.Constant][50], units=768); 
+ %257 = add(%256, meta[relay.Constant][51]); + %258 = reshape(%257, newshape=[50, 32, 768]); + %259 = add(%215, %258); + %260 = mean(%259, axis=[-1], keepdims=True); + %261 = subtract(%259, %260); + %262 = power(%261, 2f16); + %263 = mean(%262, axis=[-1], keepdims=True); + %264 = add(%263, 1e-05f16); + %265 = sqrt(%264); + %266 = divide(%261, %265); + %267 = multiply(%266, meta[relay.Constant][52]); + %268 = add(%267, meta[relay.Constant][53]); + %269 = reshape(%268, newshape=[-1, 768]); + %270 = nn.dense(%269, meta[relay.Constant][54], units=3072); + %271 = add(%270, meta[relay.Constant][55]); + %272 = reshape(%271, newshape=[50, 32, 3072]); + %273 = power(%272, 3f16); + %274 = multiply(%273, 0.044715f16); + %275 = add(%272, %274); + %276 = multiply(%275, 0.797885f16); + %277 = tanh(%276); + %278 = multiply(%272, 0.5f16); + %279 = add(%277, 1f16); + %280 = multiply(%278, %279); + %281 = reshape(%280, newshape=[-1, 3072]); + %282 = nn.dense(%281, meta[relay.Constant][56], units=768); + %283 = add(%282, meta[relay.Constant][57]); + %284 = reshape(%283, newshape=[50, 32, 768]); + %285 = add(%259, %284); + %286 = mean(%285, axis=[-1], keepdims=True); + %287 = subtract(%285, %286); + %288 = power(%287, 2f16); + %289 = mean(%288, axis=[-1], keepdims=True); + %290 = add(%289, 1e-05f16); + %291 = sqrt(%290); + %292 = divide(%287, %291); + %293 = multiply(%292, meta[relay.Constant][58]); + %294 = add(%293, meta[relay.Constant][59]); + %295 = reshape(%294, newshape=[-1, 768]); + %296 = nn.dense(%295, meta[relay.Constant][60], units=2304); + %297 = add(%296, meta[relay.Constant][61]); + %298 = reshape(%297, newshape=[50, 32, 2304]); + %299 = split(%298, indices_or_sections=[768, 1536], axis=2); + %300 = %299.0; + %301 = reshape(%300, newshape=[50, 32, 12, 64]); + %302 = transpose(%301, axes=[0, 2, 1, 3]); + %303 = %299.1; + %304 = reshape(%303, newshape=[50, 32, 12, 64]); + %305 = transpose(%304, axes=[0, 2, 3, 1]); + %306 = reshape(%305, newshape=[-1, 64, 32]); + %307 = 
reshape(%302, newshape=[-1, 32, 64]); + %308 = transpose(%306, axes=[0, 2, 1]); + %309 = nn.batch_matmul(%307, %308, out_dtype="float16", transpose_b=True); + %310 = reshape(%309, newshape=[50, 12, 32, 32]); + %311 = divide(%310, 8f16); + %312 = multiply(%311, meta[relay.Constant][62]); + %313 = subtract(%312, meta[relay.Constant][63]); + %314 = nn.softmax(%313, axis=3); + %315 = %299.2; + %316 = reshape(%315, newshape=[50, 32, 12, 64]); + %317 = transpose(%316, axes=[0, 2, 1, 3]); + %318 = reshape(%317, newshape=[-1, 32, 64]); + %319 = reshape(%314, newshape=[-1, 32, 32]); + %320 = transpose(%318, axes=[0, 2, 1]); + %321 = nn.batch_matmul(%319, %320, out_dtype="float16", transpose_b=True); + %322 = reshape(%321, newshape=[50, 12, 32, 64]); + %323 = transpose(%322, axes=[0, 2, 1, 3]); + %324 = reshape(%323, newshape=[50, 32, 768]); + %325 = reshape(%324, newshape=[-1, 768]); + %326 = nn.dense(%325, meta[relay.Constant][64], units=768); + %327 = add(%326, meta[relay.Constant][65]); + %328 = reshape(%327, newshape=[50, 32, 768]); + %329 = add(%285, %328); + %330 = mean(%329, axis=[-1], keepdims=True); + %331 = subtract(%329, %330); + %332 = power(%331, 2f16); + %333 = mean(%332, axis=[-1], keepdims=True); + %334 = add(%333, 1e-05f16); + %335 = sqrt(%334); + %336 = divide(%331, %335); + %337 = multiply(%336, meta[relay.Constant][66]); + %338 = add(%337, meta[relay.Constant][67]); + %339 = reshape(%338, newshape=[-1, 768]); + %340 = nn.dense(%339, meta[relay.Constant][68], units=3072); + %341 = add(%340, meta[relay.Constant][69]); + %342 = reshape(%341, newshape=[50, 32, 3072]); + %343 = power(%342, 3f16); + %344 = multiply(%343, 0.044715f16); + %345 = add(%342, %344); + %346 = multiply(%345, 0.797885f16); + %347 = tanh(%346); + %348 = multiply(%342, 0.5f16); + %349 = add(%347, 1f16); + %350 = multiply(%348, %349); + %351 = reshape(%350, newshape=[-1, 3072]); + %352 = nn.dense(%351, meta[relay.Constant][70], units=768); + %353 = add(%352, meta[relay.Constant][71]); + 
%354 = reshape(%353, newshape=[50, 32, 768]); + %355 = add(%329, %354); + %356 = mean(%355, axis=[-1], keepdims=True); + %357 = subtract(%355, %356); + %358 = power(%357, 2f16); + %359 = mean(%358, axis=[-1], keepdims=True); + %360 = add(%359, 1e-05f16); + %361 = sqrt(%360); + %362 = divide(%357, %361); + %363 = multiply(%362, meta[relay.Constant][72]); + %364 = add(%363, meta[relay.Constant][73]); + %365 = reshape(%364, newshape=[-1, 768]); + %366 = nn.dense(%365, meta[relay.Constant][74], units=2304); + %367 = add(%366, meta[relay.Constant][75]); + %368 = reshape(%367, newshape=[50, 32, 2304]); + %369 = split(%368, indices_or_sections=[768, 1536], axis=2); + %370 = %369.0; + %371 = reshape(%370, newshape=[50, 32, 12, 64]); + %372 = transpose(%371, axes=[0, 2, 1, 3]); + %373 = %369.1; + %374 = reshape(%373, newshape=[50, 32, 12, 64]); + %375 = transpose(%374, axes=[0, 2, 3, 1]); + %376 = reshape(%375, newshape=[-1, 64, 32]); + %377 = reshape(%372, newshape=[-1, 32, 64]); + %378 = transpose(%376, axes=[0, 2, 1]); + %379 = nn.batch_matmul(%377, %378, out_dtype="float16", transpose_b=True); + %380 = reshape(%379, newshape=[50, 12, 32, 32]); + %381 = divide(%380, 8f16); + %382 = multiply(%381, meta[relay.Constant][76]); + %383 = subtract(%382, meta[relay.Constant][77]); + %384 = nn.softmax(%383, axis=3); + %385 = %369.2; + %386 = reshape(%385, newshape=[50, 32, 12, 64]); + %387 = transpose(%386, axes=[0, 2, 1, 3]); + %388 = reshape(%387, newshape=[-1, 32, 64]); + %389 = reshape(%384, newshape=[-1, 32, 32]); + %390 = transpose(%388, axes=[0, 2, 1]); + %391 = nn.batch_matmul(%389, %390, out_dtype="float16", transpose_b=True); + %392 = reshape(%391, newshape=[50, 12, 32, 64]); + %393 = transpose(%392, axes=[0, 2, 1, 3]); + %394 = reshape(%393, newshape=[50, 32, 768]); + %395 = reshape(%394, newshape=[-1, 768]); + %396 = nn.dense(%395, meta[relay.Constant][78], units=768); + %397 = add(%396, meta[relay.Constant][79]); + %398 = reshape(%397, newshape=[50, 32, 768]); + %399 
= add(%355, %398); + %400 = mean(%399, axis=[-1], keepdims=True); + %401 = subtract(%399, %400); + %402 = power(%401, 2f16); + %403 = mean(%402, axis=[-1], keepdims=True); + %404 = add(%403, 1e-05f16); + %405 = sqrt(%404); + %406 = divide(%401, %405); + %407 = multiply(%406, meta[relay.Constant][80]); + %408 = add(%407, meta[relay.Constant][81]); + %409 = reshape(%408, newshape=[-1, 768]); + %410 = nn.dense(%409, meta[relay.Constant][82], units=3072); + %411 = add(%410, meta[relay.Constant][83]); + %412 = reshape(%411, newshape=[50, 32, 3072]); + %413 = power(%412, 3f16); + %414 = multiply(%413, 0.044715f16); + %415 = add(%412, %414); + %416 = multiply(%415, 0.797885f16); + %417 = tanh(%416); + %418 = multiply(%412, 0.5f16); + %419 = add(%417, 1f16); + %420 = multiply(%418, %419); + %421 = reshape(%420, newshape=[-1, 3072]); + %422 = nn.dense(%421, meta[relay.Constant][84], units=768); + %423 = add(%422, meta[relay.Constant][85]); + %424 = reshape(%423, newshape=[50, 32, 768]); + %425 = add(%399, %424); + %426 = mean(%425, axis=[-1], keepdims=True); + %427 = subtract(%425, %426); + %428 = power(%427, 2f16); + %429 = mean(%428, axis=[-1], keepdims=True); + %430 = add(%429, 1e-05f16); + %431 = sqrt(%430); + %432 = divide(%427, %431); + %433 = multiply(%432, meta[relay.Constant][86]); + %434 = add(%433, meta[relay.Constant][87]); + %435 = reshape(%434, newshape=[-1, 768]); + %436 = nn.dense(%435, meta[relay.Constant][88], units=2304); + %437 = add(%436, meta[relay.Constant][89]); + %438 = reshape(%437, newshape=[50, 32, 2304]); + %439 = split(%438, indices_or_sections=[768, 1536], axis=2); + %440 = %439.0; + %441 = reshape(%440, newshape=[50, 32, 12, 64]); + %442 = transpose(%441, axes=[0, 2, 1, 3]); + %443 = %439.1; + %444 = reshape(%443, newshape=[50, 32, 12, 64]); + %445 = transpose(%444, axes=[0, 2, 3, 1]); + %446 = reshape(%445, newshape=[-1, 64, 32]); + %447 = reshape(%442, newshape=[-1, 32, 64]); + %448 = transpose(%446, axes=[0, 2, 1]); + %449 = 
nn.batch_matmul(%447, %448, out_dtype="float16", transpose_b=True); + %450 = reshape(%449, newshape=[50, 12, 32, 32]); + %451 = divide(%450, 8f16); + %452 = multiply(%451, meta[relay.Constant][90]); + %453 = subtract(%452, meta[relay.Constant][91]); + %454 = nn.softmax(%453, axis=3); + %455 = %439.2; + %456 = reshape(%455, newshape=[50, 32, 12, 64]); + %457 = transpose(%456, axes=[0, 2, 1, 3]); + %458 = reshape(%457, newshape=[-1, 32, 64]); + %459 = reshape(%454, newshape=[-1, 32, 32]); + %460 = transpose(%458, axes=[0, 2, 1]); + %461 = nn.batch_matmul(%459, %460, out_dtype="float16", transpose_b=True); + %462 = reshape(%461, newshape=[50, 12, 32, 64]); + %463 = transpose(%462, axes=[0, 2, 1, 3]); + %464 = reshape(%463, newshape=[50, 32, 768]); + %465 = reshape(%464, newshape=[-1, 768]); + %466 = nn.dense(%465, meta[relay.Constant][92], units=768); + %467 = add(%466, meta[relay.Constant][93]); + %468 = reshape(%467, newshape=[50, 32, 768]); + %469 = add(%425, %468); + %470 = mean(%469, axis=[-1], keepdims=True); + %471 = subtract(%469, %470); + %472 = power(%471, 2f16); + %473 = mean(%472, axis=[-1], keepdims=True); + %474 = add(%473, 1e-05f16); + %475 = sqrt(%474); + %476 = divide(%471, %475); + %477 = multiply(%476, meta[relay.Constant][94]); + %478 = add(%477, meta[relay.Constant][95]); + %479 = reshape(%478, newshape=[-1, 768]); + %480 = nn.dense(%479, meta[relay.Constant][96], units=3072); + %481 = add(%480, meta[relay.Constant][97]); + %482 = reshape(%481, newshape=[50, 32, 3072]); + %483 = power(%482, 3f16); + %484 = multiply(%483, 0.044715f16); + %485 = add(%482, %484); + %486 = multiply(%485, 0.797885f16); + %487 = tanh(%486); + %488 = multiply(%482, 0.5f16); + %489 = add(%487, 1f16); + %490 = multiply(%488, %489); + %491 = reshape(%490, newshape=[-1, 3072]); + %492 = nn.dense(%491, meta[relay.Constant][98], units=768); + %493 = add(%492, meta[relay.Constant][99]); + %494 = reshape(%493, newshape=[50, 32, 768]); + %495 = add(%469, %494); + %496 = 
mean(%495, axis=[-1], keepdims=True); + %497 = subtract(%495, %496); + %498 = power(%497, 2f16); + %499 = mean(%498, axis=[-1], keepdims=True); + %500 = add(%499, 1e-05f16); + %501 = sqrt(%500); + %502 = divide(%497, %501); + %503 = multiply(%502, meta[relay.Constant][100]); + %504 = add(%503, meta[relay.Constant][101]); + %505 = reshape(%504, newshape=[-1, 768]); + %506 = nn.dense(%505, meta[relay.Constant][102], units=2304); + %507 = add(%506, meta[relay.Constant][103]); + %508 = reshape(%507, newshape=[50, 32, 2304]); + %509 = split(%508, indices_or_sections=[768, 1536], axis=2); + %510 = %509.0; + %511 = reshape(%510, newshape=[50, 32, 12, 64]); + %512 = transpose(%511, axes=[0, 2, 1, 3]); + %513 = %509.1; + %514 = reshape(%513, newshape=[50, 32, 12, 64]); + %515 = transpose(%514, axes=[0, 2, 3, 1]); + %516 = reshape(%515, newshape=[-1, 64, 32]); + %517 = reshape(%512, newshape=[-1, 32, 64]); + %518 = transpose(%516, axes=[0, 2, 1]); + %519 = nn.batch_matmul(%517, %518, out_dtype="float16", transpose_b=True); + %520 = reshape(%519, newshape=[50, 12, 32, 32]); + %521 = divide(%520, 8f16); + %522 = multiply(%521, meta[relay.Constant][104]); + %523 = subtract(%522, meta[relay.Constant][105]); + %524 = nn.softmax(%523, axis=3); + %525 = %509.2; + %526 = reshape(%525, newshape=[50, 32, 12, 64]); + %527 = transpose(%526, axes=[0, 2, 1, 3]); + %528 = reshape(%527, newshape=[-1, 32, 64]); + %529 = reshape(%524, newshape=[-1, 32, 32]); + %530 = transpose(%528, axes=[0, 2, 1]); + %531 = nn.batch_matmul(%529, %530, out_dtype="float16", transpose_b=True); + %532 = reshape(%531, newshape=[50, 12, 32, 64]); + %533 = transpose(%532, axes=[0, 2, 1, 3]); + %534 = reshape(%533, newshape=[50, 32, 768]); + %535 = reshape(%534, newshape=[-1, 768]); + %536 = nn.dense(%535, meta[relay.Constant][106], units=768); + %537 = add(%536, meta[relay.Constant][107]); + %538 = reshape(%537, newshape=[50, 32, 768]); + %539 = add(%495, %538); + %540 = mean(%539, axis=[-1], keepdims=True); + %541 
= subtract(%539, %540); + %542 = power(%541, 2f16); + %543 = mean(%542, axis=[-1], keepdims=True); + %544 = add(%543, 1e-05f16); + %545 = sqrt(%544); + %546 = divide(%541, %545); + %547 = multiply(%546, meta[relay.Constant][108]); + %548 = add(%547, meta[relay.Constant][109]); + %549 = reshape(%548, newshape=[-1, 768]); + %550 = nn.dense(%549, meta[relay.Constant][110], units=3072); + %551 = add(%550, meta[relay.Constant][111]); + %552 = reshape(%551, newshape=[50, 32, 3072]); + %553 = power(%552, 3f16); + %554 = multiply(%553, 0.044715f16); + %555 = add(%552, %554); + %556 = multiply(%555, 0.797885f16); + %557 = tanh(%556); + %558 = multiply(%552, 0.5f16); + %559 = add(%557, 1f16); + %560 = multiply(%558, %559); + %561 = reshape(%560, newshape=[-1, 3072]); + %562 = nn.dense(%561, meta[relay.Constant][112], units=768); + %563 = add(%562, meta[relay.Constant][113]); + %564 = reshape(%563, newshape=[50, 32, 768]); + %565 = add(%539, %564); + %566 = mean(%565, axis=[-1], keepdims=True); + %567 = subtract(%565, %566); + %568 = power(%567, 2f16); + %569 = mean(%568, axis=[-1], keepdims=True); + %570 = add(%569, 1e-05f16); + %571 = sqrt(%570); + %572 = divide(%567, %571); + %573 = multiply(%572, meta[relay.Constant][114]); + %574 = add(%573, meta[relay.Constant][115]); + %575 = reshape(%574, newshape=[-1, 768]); + %576 = nn.dense(%575, meta[relay.Constant][116], units=2304); + %577 = add(%576, meta[relay.Constant][117]); + %578 = reshape(%577, newshape=[50, 32, 2304]); + %579 = split(%578, indices_or_sections=[768, 1536], axis=2); + %580 = %579.0; + %581 = reshape(%580, newshape=[50, 32, 12, 64]); + %582 = transpose(%581, axes=[0, 2, 1, 3]); + %583 = %579.1; + %584 = reshape(%583, newshape=[50, 32, 12, 64]); + %585 = transpose(%584, axes=[0, 2, 3, 1]); + %586 = reshape(%585, newshape=[-1, 64, 32]); + %587 = reshape(%582, newshape=[-1, 32, 64]); + %588 = transpose(%586, axes=[0, 2, 1]); + %589 = nn.batch_matmul(%587, %588, out_dtype="float16", transpose_b=True); + %590 = 
reshape(%589, newshape=[50, 12, 32, 32]); + %591 = divide(%590, 8f16); + %592 = multiply(%591, meta[relay.Constant][118]); + %593 = subtract(%592, meta[relay.Constant][119]); + %594 = nn.softmax(%593, axis=3); + %595 = %579.2; + %596 = reshape(%595, newshape=[50, 32, 12, 64]); + %597 = transpose(%596, axes=[0, 2, 1, 3]); + %598 = reshape(%597, newshape=[-1, 32, 64]); + %599 = reshape(%594, newshape=[-1, 32, 32]); + %600 = transpose(%598, axes=[0, 2, 1]); + %601 = nn.batch_matmul(%599, %600, out_dtype="float16", transpose_b=True); + %602 = reshape(%601, newshape=[50, 12, 32, 64]); + %603 = transpose(%602, axes=[0, 2, 1, 3]); + %604 = reshape(%603, newshape=[50, 32, 768]); + %605 = reshape(%604, newshape=[-1, 768]); + %606 = nn.dense(%605, meta[relay.Constant][120], units=768); + %607 = add(%606, meta[relay.Constant][121]); + %608 = reshape(%607, newshape=[50, 32, 768]); + %609 = add(%565, %608); + %610 = mean(%609, axis=[-1], keepdims=True); + %611 = subtract(%609, %610); + %612 = power(%611, 2f16); + %613 = mean(%612, axis=[-1], keepdims=True); + %614 = add(%613, 1e-05f16); + %615 = sqrt(%614); + %616 = divide(%611, %615); + %617 = multiply(%616, meta[relay.Constant][122]); + %618 = add(%617, meta[relay.Constant][123]); + %619 = reshape(%618, newshape=[-1, 768]); + %620 = nn.dense(%619, meta[relay.Constant][124], units=3072); + %621 = add(%620, meta[relay.Constant][125]); + %622 = reshape(%621, newshape=[50, 32, 3072]); + %623 = power(%622, 3f16); + %624 = multiply(%623, 0.044715f16); + %625 = add(%622, %624); + %626 = multiply(%625, 0.797885f16); + %627 = tanh(%626); + %628 = multiply(%622, 0.5f16); + %629 = add(%627, 1f16); + %630 = multiply(%628, %629); + %631 = reshape(%630, newshape=[-1, 3072]); + %632 = nn.dense(%631, meta[relay.Constant][126], units=768); + %633 = add(%632, meta[relay.Constant][127]); + %634 = reshape(%633, newshape=[50, 32, 768]); + %635 = add(%609, %634); + %636 = mean(%635, axis=[-1], keepdims=True); + %637 = subtract(%635, %636); + %638 
= power(%637, 2f16); + %639 = mean(%638, axis=[-1], keepdims=True); + %640 = add(%639, 1e-05f16); + %641 = sqrt(%640); + %642 = divide(%637, %641); + %643 = multiply(%642, meta[relay.Constant][128]); + %644 = add(%643, meta[relay.Constant][129]); + %645 = reshape(%644, newshape=[-1, 768]); + %646 = nn.dense(%645, meta[relay.Constant][130], units=2304); + %647 = add(%646, meta[relay.Constant][131]); + %648 = reshape(%647, newshape=[50, 32, 2304]); + %649 = split(%648, indices_or_sections=[768, 1536], axis=2); + %650 = %649.0; + %651 = reshape(%650, newshape=[50, 32, 12, 64]); + %652 = transpose(%651, axes=[0, 2, 1, 3]); + %653 = %649.1; + %654 = reshape(%653, newshape=[50, 32, 12, 64]); + %655 = transpose(%654, axes=[0, 2, 3, 1]); + %656 = reshape(%655, newshape=[-1, 64, 32]); + %657 = reshape(%652, newshape=[-1, 32, 64]); + %658 = transpose(%656, axes=[0, 2, 1]); + %659 = nn.batch_matmul(%657, %658, out_dtype="float16", transpose_b=True); + %660 = reshape(%659, newshape=[50, 12, 32, 32]); + %661 = divide(%660, 8f16); + %662 = multiply(%661, meta[relay.Constant][132]); + %663 = subtract(%662, meta[relay.Constant][133]); + %664 = nn.softmax(%663, axis=3); + %665 = %649.2; + %666 = reshape(%665, newshape=[50, 32, 12, 64]); + %667 = transpose(%666, axes=[0, 2, 1, 3]); + %668 = reshape(%667, newshape=[-1, 32, 64]); + %669 = reshape(%664, newshape=[-1, 32, 32]); + %670 = transpose(%668, axes=[0, 2, 1]); + %671 = nn.batch_matmul(%669, %670, out_dtype="float16", transpose_b=True); + %672 = reshape(%671, newshape=[50, 12, 32, 64]); + %673 = transpose(%672, axes=[0, 2, 1, 3]); + %674 = reshape(%673, newshape=[50, 32, 768]); + %675 = reshape(%674, newshape=[-1, 768]); + %676 = nn.dense(%675, meta[relay.Constant][134], units=768); + %677 = add(%676, meta[relay.Constant][135]); + %678 = reshape(%677, newshape=[50, 32, 768]); + %679 = add(%635, %678); + %680 = mean(%679, axis=[-1], keepdims=True); + %681 = subtract(%679, %680); + %682 = power(%681, 2f16); + %683 = mean(%682, 
axis=[-1], keepdims=True); + %684 = add(%683, 1e-05f16); + %685 = sqrt(%684); + %686 = divide(%681, %685); + %687 = multiply(%686, meta[relay.Constant][136]); + %688 = add(%687, meta[relay.Constant][137]); + %689 = reshape(%688, newshape=[-1, 768]); + %690 = nn.dense(%689, meta[relay.Constant][138], units=3072); + %691 = add(%690, meta[relay.Constant][139]); + %692 = reshape(%691, newshape=[50, 32, 3072]); + %693 = power(%692, 3f16); + %694 = multiply(%693, 0.044715f16); + %695 = add(%692, %694); + %696 = multiply(%695, 0.797885f16); + %697 = tanh(%696); + %698 = multiply(%692, 0.5f16); + %699 = add(%697, 1f16); + %700 = multiply(%698, %699); + %701 = reshape(%700, newshape=[-1, 3072]); + %702 = nn.dense(%701, meta[relay.Constant][140], units=768); + %703 = add(%702, meta[relay.Constant][141]); + %704 = reshape(%703, newshape=[50, 32, 768]); + %705 = add(%679, %704); + %706 = mean(%705, axis=[-1], keepdims=True); + %707 = subtract(%705, %706); + %708 = power(%707, 2f16); + %709 = mean(%708, axis=[-1], keepdims=True); + %710 = add(%709, 1e-05f16); + %711 = sqrt(%710); + %712 = divide(%707, %711); + %713 = multiply(%712, meta[relay.Constant][142]); + %714 = add(%713, meta[relay.Constant][143]); + %715 = reshape(%714, newshape=[-1, 768]); + %716 = nn.dense(%715, meta[relay.Constant][144], units=2304); + %717 = add(%716, meta[relay.Constant][145]); + %718 = reshape(%717, newshape=[50, 32, 2304]); + %719 = split(%718, indices_or_sections=[768, 1536], axis=2); + %720 = %719.0; + %721 = reshape(%720, newshape=[50, 32, 12, 64]); + %722 = transpose(%721, axes=[0, 2, 1, 3]); + %723 = %719.1; + %724 = reshape(%723, newshape=[50, 32, 12, 64]); + %725 = transpose(%724, axes=[0, 2, 3, 1]); + %726 = reshape(%725, newshape=[-1, 64, 32]); + %727 = reshape(%722, newshape=[-1, 32, 64]); + %728 = transpose(%726, axes=[0, 2, 1]); + %729 = nn.batch_matmul(%727, %728, out_dtype="float16", transpose_b=True); + %730 = reshape(%729, newshape=[50, 12, 32, 32]); + %731 = divide(%730, 8f16); + 
%732 = multiply(%731, meta[relay.Constant][146]); + %733 = subtract(%732, meta[relay.Constant][147]); + %734 = nn.softmax(%733, axis=3); + %735 = %719.2; + %736 = reshape(%735, newshape=[50, 32, 12, 64]); + %737 = transpose(%736, axes=[0, 2, 1, 3]); + %738 = reshape(%737, newshape=[-1, 32, 64]); + %739 = reshape(%734, newshape=[-1, 32, 32]); + %740 = transpose(%738, axes=[0, 2, 1]); + %741 = nn.batch_matmul(%739, %740, out_dtype="float16", transpose_b=True); + %742 = reshape(%741, newshape=[50, 12, 32, 64]); + %743 = transpose(%742, axes=[0, 2, 1, 3]); + %744 = reshape(%743, newshape=[50, 32, 768]); + %745 = reshape(%744, newshape=[-1, 768]); + %746 = nn.dense(%745, meta[relay.Constant][148], units=768); + %747 = add(%746, meta[relay.Constant][149]); + %748 = reshape(%747, newshape=[50, 32, 768]); + %749 = add(%705, %748); + %750 = mean(%749, axis=[-1], keepdims=True); + %751 = subtract(%749, %750); + %752 = power(%751, 2f16); + %753 = mean(%752, axis=[-1], keepdims=True); + %754 = add(%753, 1e-05f16); + %755 = sqrt(%754); + %756 = divide(%751, %755); + %757 = multiply(%756, meta[relay.Constant][150]); + %758 = add(%757, meta[relay.Constant][151]); + %759 = reshape(%758, newshape=[-1, 768]); + %760 = nn.dense(%759, meta[relay.Constant][152], units=3072); + %761 = add(%760, meta[relay.Constant][153]); + %762 = reshape(%761, newshape=[50, 32, 3072]); + %763 = power(%762, 3f16); + %764 = multiply(%763, 0.044715f16); + %765 = add(%762, %764); + %766 = multiply(%765, 0.797885f16); + %767 = tanh(%766); + %768 = multiply(%762, 0.5f16); + %769 = add(%767, 1f16); + %770 = multiply(%768, %769); + %771 = reshape(%770, newshape=[-1, 3072]); + %772 = nn.dense(%771, meta[relay.Constant][154], units=768); + %773 = add(%772, meta[relay.Constant][155]); + %774 = reshape(%773, newshape=[50, 32, 768]); + %775 = add(%749, %774); + %776 = mean(%775, axis=[-1], keepdims=True); + %777 = subtract(%775, %776); + %778 = power(%777, 2f16); + %779 = mean(%778, axis=[-1], keepdims=True); + 
%780 = add(%779, 1e-05f16); + %781 = sqrt(%780); + %782 = divide(%777, %781); + %783 = multiply(%782, meta[relay.Constant][156]); + %784 = add(%783, meta[relay.Constant][157]); + %785 = reshape(%784, newshape=[-1, 768]); + %786 = nn.dense(%785, meta[relay.Constant][158], units=2304); + %787 = add(%786, meta[relay.Constant][159]); + %788 = reshape(%787, newshape=[50, 32, 2304]); + %789 = split(%788, indices_or_sections=[768, 1536], axis=2); + %790 = %789.0; + %791 = reshape(%790, newshape=[50, 32, 12, 64]); + %792 = transpose(%791, axes=[0, 2, 1, 3]); + %793 = %789.1; + %794 = reshape(%793, newshape=[50, 32, 12, 64]); + %795 = transpose(%794, axes=[0, 2, 3, 1]); + %796 = reshape(%795, newshape=[-1, 64, 32]); + %797 = reshape(%792, newshape=[-1, 32, 64]); + %798 = transpose(%796, axes=[0, 2, 1]); + %799 = nn.batch_matmul(%797, %798, out_dtype="float16", transpose_b=True); + %800 = reshape(%799, newshape=[50, 12, 32, 32]); + %801 = divide(%800, 8f16); + %802 = multiply(%801, meta[relay.Constant][160]); + %803 = subtract(%802, meta[relay.Constant][161]); + %804 = nn.softmax(%803, axis=3); + %805 = %789.2; + %806 = reshape(%805, newshape=[50, 32, 12, 64]); + %807 = transpose(%806, axes=[0, 2, 1, 3]); + %808 = reshape(%807, newshape=[-1, 32, 64]); + %809 = reshape(%804, newshape=[-1, 32, 32]); + %810 = transpose(%808, axes=[0, 2, 1]); + %811 = nn.batch_matmul(%809, %810, out_dtype="float16", transpose_b=True); + %812 = reshape(%811, newshape=[50, 12, 32, 64]); + %813 = transpose(%812, axes=[0, 2, 1, 3]); + %814 = reshape(%813, newshape=[50, 32, 768]); + %815 = reshape(%814, newshape=[-1, 768]); + %816 = nn.dense(%815, meta[relay.Constant][162], units=768); + %817 = add(%816, meta[relay.Constant][163]); + %818 = reshape(%817, newshape=[50, 32, 768]); + %819 = add(%775, %818); + %820 = mean(%819, axis=[-1], keepdims=True); + %821 = subtract(%819, %820); + %822 = power(%821, 2f16); + %823 = mean(%822, axis=[-1], keepdims=True); + %824 = add(%823, 1e-05f16); + %825 = 
sqrt(%824); + %826 = divide(%821, %825); + %827 = multiply(%826, meta[relay.Constant][164]); + %828 = add(%827, meta[relay.Constant][165]); + %829 = reshape(%828, newshape=[-1, 768]); + %830 = nn.dense(%829, meta[relay.Constant][166], units=3072); + %831 = add(%830, meta[relay.Constant][167]); + %832 = reshape(%831, newshape=[50, 32, 3072]); + %833 = power(%832, 3f16); + %834 = multiply(%833, 0.044715f16); + %835 = add(%832, %834); + %836 = multiply(%835, 0.797885f16); + %837 = tanh(%836); + %838 = multiply(%832, 0.5f16); + %839 = add(%837, 1f16); + %840 = multiply(%838, %839); + %841 = reshape(%840, newshape=[-1, 3072]); + %842 = nn.dense(%841, meta[relay.Constant][168], units=768); + %843 = add(%842, meta[relay.Constant][169]); + %844 = reshape(%843, newshape=[50, 32, 768]); + %845 = add(%819, %844); + %846 = mean(%845, axis=[-1], keepdims=True); + %847 = subtract(%845, %846); + %848 = power(%847, 2f16); + %849 = mean(%848, axis=[-1], keepdims=True); + %850 = add(%849, 1e-05f16); + %851 = sqrt(%850); + %852 = divide(%847, %851); + %853 = multiply(%852, meta[relay.Constant][170]); + %854 = add(%853, meta[relay.Constant][171]); + %855 = transpose(%24, axes=[0, 2, 1, 3]); + %856 = expand_dims(%855, axis=0); + %857 = expand_dims(%37, axis=0); + %858 = (%856, %857); + %859 = transpose(%94, axes=[0, 2, 1, 3]); + %860 = expand_dims(%859, axis=0); + %861 = expand_dims(%107, axis=0); + %862 = (%860, %861); + %863 = transpose(%164, axes=[0, 2, 1, 3]); + %864 = expand_dims(%863, axis=0); + %865 = expand_dims(%177, axis=0); + %866 = (%864, %865); + %867 = transpose(%234, axes=[0, 2, 1, 3]); + %868 = expand_dims(%867, axis=0); + %869 = expand_dims(%247, axis=0); + %870 = (%868, %869); + %871 = transpose(%304, axes=[0, 2, 1, 3]); + %872 = expand_dims(%871, axis=0); + %873 = expand_dims(%317, axis=0); + %874 = (%872, %873); + %875 = transpose(%374, axes=[0, 2, 1, 3]); + %876 = expand_dims(%875, axis=0); + %877 = expand_dims(%387, axis=0); + %878 = (%876, %877); + %879 = 
transpose(%444, axes=[0, 2, 1, 3]); + %880 = expand_dims(%879, axis=0); + %881 = expand_dims(%457, axis=0); + %882 = (%880, %881); + %883 = transpose(%514, axes=[0, 2, 1, 3]); + %884 = expand_dims(%883, axis=0); + %885 = expand_dims(%527, axis=0); + %886 = (%884, %885); + %887 = transpose(%584, axes=[0, 2, 1, 3]); + %888 = expand_dims(%887, axis=0); + %889 = expand_dims(%597, axis=0); + %890 = (%888, %889); + %891 = transpose(%654, axes=[0, 2, 1, 3]); + %892 = expand_dims(%891, axis=0); + %893 = expand_dims(%667, axis=0); + %894 = (%892, %893); + %895 = transpose(%724, axes=[0, 2, 1, 3]); + %896 = expand_dims(%895, axis=0); + %897 = expand_dims(%737, axis=0); + %898 = (%896, %897); + %899 = transpose(%794, axes=[0, 2, 1, 3]); + %900 = expand_dims(%899, axis=0); + %901 = expand_dims(%807, axis=0); + %902 = (%900, %901); + %903 = reshape(%854, newshape=[1, 50, 32, 768]); + %904 = concatenate(%858); + %905 = concatenate(%862); + %906 = concatenate(%866); + %907 = concatenate(%870); + %908 = concatenate(%874); + %909 = concatenate(%878); + %910 = concatenate(%882); + %911 = concatenate(%886); + %912 = concatenate(%890); + %913 = concatenate(%894); + %914 = concatenate(%898); + %915 = concatenate(%902); + (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_16", + "input_shapes": {"x": [1, 50, 32]}, + "input_dtypes": {"x": "int64"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def gpt2_extract_consts(dtype): + return make_consts( + dtype, + [ + (768, 768), # 0 + (768,), # 1 + (768,), # 2 + (768,), # 3 + (3072, 768), # 4 + (3072,), # 5 + (1, 32, 768), # 6 + ], + ) + + +def gpt2_extract(): + metatable = {"relay.Constant": gpt2_extract_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1600, 768), float32]) -> Tensor[(50, 32, 3072), float32] { + %46 = nn.dense(%x, meta[relay.Constant][0], units=768); 
+ %47 = add(%46, meta[relay.Constant][1]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(meta[relay.Constant][6], %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][2]); + %58 = add(%57, meta[relay.Constant][3]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][4], units=3072); + %61 = add(%60, meta[relay.Constant][5]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f); + %64 = multiply(%63, 0.044715f); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f); + %69 = add(%67, 1f); + %70 = multiply(%68, %69); + %70 + } + """, + "from_string", + None, + metatable, + ) + return { + "input_shapes": {"x": [1600, 768]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_extract_16(): + metatable = {"relay.Constant": gpt2_extract_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1600, 768), float16]) -> Tensor[(50, 32, 3072), float16] { + %46 = nn.dense(%x, meta[relay.Constant][0], units=768); + %47 = add(%46, meta[relay.Constant][1]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(meta[relay.Constant][6], %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f16); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f16); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][2]); + %58 = add(%57, meta[relay.Constant][3]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][4], units=3072); + %61 = add(%60, meta[relay.Constant][5]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f16); + 
%64 = multiply(%63, 0.044715f16); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f16); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f16); + %69 = add(%67, 1f16); + %70 = multiply(%68, %69); + %70 + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_extract_16", + "input_shapes": {"x": [1600, 768]}, + "input_dtypes": {"x": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def gpt2_16_for_cutlass_extract_consts(dtype): + return make_consts( + "float16", + [ + (2304, 768), # 0 + (2304,), # 1 + (600, 32, 64), # 2 + (600, 32, 32), # 3 + ], + ) + + +def gpt2_16_for_cutlass_extract(): + metatable = {"relay.Constant": gpt2_16_for_cutlass_extract_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0: Tensor[(1600, 768), float16], + %x3: Tensor[(600, 32, 64), float16]) + -> (Tensor[(1600, 2304), float16], Tensor[(1200, 32, 32), float16]) { + %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304); + %1 = add(%0, meta[relay.Constant][1]); + %2 = nn.batch_matmul(%x3, meta[relay.Constant][2], out_dtype="float16", transpose_b=True); + %3 = (%2, meta[relay.Constant][3]); + %4 = concatenate(%3); + (%1, %4) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_16_for_cutlass_extract", + "input_shapes": {"x0": (1600, 768), "x3": (600, 32, 64)}, + "input_dtypes": {"x0": "float16", "x3": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def resnet50_consts(dtype): + return make_consts( + dtype, + [ + (3,), # 0 + (3,), # 1 + (3,), # 2 + (3,), # 3 + (64, 3, 7, 7), # 4 + (64,), # 5 + (64,), # 6 + (64,), # 7 + (64,), # 8 + (64,), # 9 + (64,), # 10 + (64,), # 11 + (64,), # 12 + (64, 64, 1, 1), # 13 + (64,), # 14 + (64,), # 15 + (64,), # 16 + (64,), # 17 + (64, 64, 3, 3), # 18 + (64,), # 19 + (64,), # 20 + (64,), # 21 + (64,), # 22 + (256, 64, 1, 1), # 23 + (256, 64, 1, 1), # 24 + (256,), # 25 + (256,), # 26 + (256,), # 27 + (256,), # 28 + 
(64, 256, 1, 1), # 29 + (64,), # 30 + (64,), # 31 + (64,), # 32 + (64,), # 33 + (64, 64, 3, 3), # 34 + (64,), # 35 + (64,), # 36 + (64,), # 37 + (64,), # 38 + (256, 64, 1, 1), # 39 + (256,), # 40 + (256,), # 41 + (256,), # 42 + (256,), # 43 + (64, 256, 1, 1), # 44 + (64,), # 45 + (64,), # 46 + (64,), # 47 + (64,), # 48 + (64, 64, 3, 3), # 49 + (64,), # 50 + (64,), # 51 + (64,), # 52 + (64,), # 53 + (256, 64, 1, 1), # 54 + (256,), # 55 + (256,), # 56 + (256,), # 57 + (256,), # 58 + (128, 256, 1, 1), # 59 + (128,), # 60 + (128,), # 61 + (128,), # 62 + (128,), # 63 + (128, 128, 3, 3), # 64 + (128,), # 65 + (128,), # 66 + (128,), # 67 + (128,), # 68 + (512, 128, 1, 1), # 69 + (512, 256, 1, 1), # 70 + (512,), # 71 + (512,), # 72 + (512,), # 73 + (512,), # 74 + (128, 512, 1, 1), # 75 + (128,), # 76 + (128,), # 77 + (128,), # 78 + (128,), # 79 + (128, 128, 3, 3), # 80 + (128,), # 81 + (128,), # 82 + (128,), # 83 + (128,), # 84 + (512, 128, 1, 1), # 85 + (512,), # 86 + (512,), # 87 + (512,), # 88 + (512,), # 89 + (128, 512, 1, 1), # 90 + (128,), # 91 + (128,), # 92 + (128,), # 93 + (128,), # 94 + (128, 128, 3, 3), # 95 + (128,), # 96 + (128,), # 97 + (128,), # 98 + (128,), # 99 + (512, 128, 1, 1), # 100 + (512,), # 101 + (512,), # 102 + (512,), # 103 + (512,), # 104 + (128, 512, 1, 1), # 105 + (128,), # 106 + (128,), # 107 + (128,), # 108 + (128,), # 109 + (128, 128, 3, 3), # 110 + (128,), # 111 + (128,), # 112 + (128,), # 113 + (128,), # 114 + (512, 128, 1, 1), # 115 + (512,), # 116 + (512,), # 117 + (512,), # 118 + (512,), # 119 + (256, 512, 1, 1), # 120 + (256,), # 121 + (256,), # 122 + (256,), # 123 + (256,), # 124 + (256, 256, 3, 3), # 125 + (256,), # 126 + (256,), # 127 + (256,), # 128 + (256,), # 129 + (1024, 256, 1, 1), # 130 + (1024, 512, 1, 1), # 131 + (1024,), # 132 + (1024,), # 133 + (1024,), # 134 + (1024,), # 135 + (256, 1024, 1, 1), # 136 + (256,), # 137 + (256,), # 138 + (256,), # 139 + (256,), # 140 + (256, 256, 3, 3), # 141 + (256,), # 142 + (256,), # 143 
+ (256,), # 144 + (256,), # 145 + (1024, 256, 1, 1), # 146 + (1024,), # 147 + (1024,), # 148 + (1024,), # 149 + (1024,), # 150 + (256, 1024, 1, 1), # 151 + (256,), # 152 + (256,), # 153 + (256,), # 154 + (256,), # 155 + (256, 256, 3, 3), # 156 + (256,), # 157 + (256,), # 158 + (256,), # 159 + (256,), # 160 + (1024, 256, 1, 1), # 161 + (1024,), # 162 + (1024,), # 163 + (1024,), # 164 + (1024,), # 165 + (256, 1024, 1, 1), # 166 + (256,), # 167 + (256,), # 168 + (256,), # 169 + (256,), # 170 + (256, 256, 3, 3), # 171 + (256,), # 172 + (256,), # 173 + (256,), # 174 + (256,), # 175 + (1024, 256, 1, 1), # 176 + (1024,), # 177 + (1024,), # 178 + (1024,), # 179 + (1024,), # 180 + (256, 1024, 1, 1), # 181 + (256,), # 182 + (256,), # 183 + (256,), # 184 + (256,), # 185 + (256, 256, 3, 3), # 186 + (256,), # 187 + (256,), # 188 + (256,), # 189 + (256,), # 190 + (1024, 256, 1, 1), # 191 + (1024,), # 192 + (1024,), # 193 + (1024,), # 194 + (1024,), # 195 + (256, 1024, 1, 1), # 196 + (256,), # 197 + (256,), # 198 + (256,), # 199 + (256,), # 200 + (256, 256, 3, 3), # 201 + (256,), # 202 + (256,), # 203 + (256,), # 204 + (256,), # 205 + (1024, 256, 1, 1), # 206 + (1024,), # 207 + (1024,), # 208 + (1024,), # 209 + (1024,), # 210 + (512, 1024, 1, 1), # 211 + (512,), # 212 + (512,), # 213 + (512,), # 214 + (512,), # 215 + (512, 512, 3, 3), # 216 + (512,), # 217 + (512,), # 218 + (512,), # 219 + (512,), # 220 + (2048, 512, 1, 1), # 221 + (2048, 1024, 1, 1), # 222 + (2048,), # 223 + (2048,), # 224 + (2048,), # 225 + (2048,), # 226 + (512, 2048, 1, 1), # 227 + (512,), # 228 + (512,), # 229 + (512,), # 230 + (512,), # 231 + (512, 512, 3, 3), # 232 + (512,), # 233 + (512,), # 234 + (512,), # 235 + (512,), # 236 + (2048, 512, 1, 1), # 237 + (2048,), # 238 + (2048,), # 239 + (2048,), # 240 + (2048,), # 241 + (512, 2048, 1, 1), # 242 + (512,), # 243 + (512,), # 244 + (512,), # 245 + (512,), # 246 + (512, 512, 3, 3), # 247 + (512,), # 248 + (512,), # 249 + (512,), # 250 + (512,), # 251 + 
(2048, 512, 1, 1), # 252 + (2048,), # 253 + (2048,), # 254 + (2048,), # 255 + (2048,), # 256 + (1000, 2048), # 257 + (1000,), # 258 + ], + ) + + +def resnet50(): + metatable = {"relay.Constant": resnet50_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] { + %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %1 = %0.0; + %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]); + %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]); + %4 = %3.0; + %5 = nn.relu(%4); + %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]); + %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]); + %8 = %7.0; + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]); + %12 = %11.0; + %13 = nn.relu(%12); + %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]); + %16 = %15.0; + %17 = nn.relu(%16); + %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%18, %19); + %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]); + %22 = %21.0; + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, 
meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %25 = nn.batch_norm(%24, meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]); + %26 = %25.0; + %27 = nn.relu(%26); + %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]); + %30 = %29.0; + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %33 = add(%32, %20); + %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]); + %35 = %34.0; + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]); + %39 = %38.0; + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]); + %43 = %42.0; + %44 = nn.relu(%43); + %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %46 = add(%45, %33); + %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]); + %48 = %47.0; + %49 = nn.relu(%48); + %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]); + %52 = %51.0; + %53 = nn.relu(%52); + %54 = nn.conv2d(%53, meta[relay.Constant][64], 
strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %55 = nn.batch_norm(%54, meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]); + %56 = %55.0; + %57 = nn.relu(%56); + %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = add(%58, %59); + %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %62 = %61.0; + %63 = nn.relu(%62); + %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %66 = %65.0; + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %70 = %69.0; + %71 = nn.relu(%70); + %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %73 = add(%72, %60); + %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], 
meta[relay.Constant][99]); + %83 = %82.0; + %84 = nn.relu(%83); + %85 = nn.conv2d(%84, meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %86 = add(%85, %73); + %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %88 = %87.0; + %89 = nn.relu(%88); + %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %92 = %91.0; + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %96 = %95.0; + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %99 = add(%98, %86); + %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %101 = %100.0; + %102 = nn.relu(%101); + %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %105 = %104.0; + %106 = nn.relu(%105); + %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %109 = %108.0; + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %112 = nn.conv2d(%102, meta[relay.Constant][131], 
strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %113 = add(%111, %112); + %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]); + %115 = %114.0; + %116 = nn.relu(%115); + %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]); + %119 = %118.0; + %120 = nn.relu(%119); + %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]); + %123 = %122.0; + %124 = nn.relu(%123); + %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %126 = add(%125, %113); + %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]); + %128 = %127.0; + %129 = nn.relu(%128); + %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]); + %132 = %131.0; + %133 = nn.relu(%132); + %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]); + %136 = %135.0; + %137 = nn.relu(%136); + %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %139 = add(%138, %126); + %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], 
meta[relay.Constant][165]); + %141 = %140.0; + %142 = nn.relu(%141); + %143 = nn.conv2d(%142, meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %152 = add(%151, %139); + %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]); + %154 = %153.0; + %155 = nn.relu(%154); + %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]); + %158 = %157.0; + %159 = nn.relu(%158); + %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]); + %162 = %161.0; + %163 = nn.relu(%162); + %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %165 = add(%164, %152); + %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]); + %167 = %166.0; + %168 = nn.relu(%167); + %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %170 = nn.batch_norm(%169, 
meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], meta[relay.Constant][200]); + %171 = %170.0; + %172 = nn.relu(%171); + %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]); + %175 = %174.0; + %176 = nn.relu(%175); + %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %178 = add(%177, %165); + %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]); + %188 = %187.0; + %189 = nn.relu(%188); + %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %192 = add(%190, %191); + %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]); + %194 = %193.0; + %195 = nn.relu(%194); + %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], 
meta[relay.Constant][230], meta[relay.Constant][231]); + %198 = %197.0; + %199 = nn.relu(%198); + %200 = nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]); + %202 = %201.0; + %203 = nn.relu(%202); + %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %205 = add(%204, %192); + %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]); + %207 = %206.0; + %208 = nn.relu(%207); + %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]); + %211 = %210.0; + %212 = nn.relu(%211); + %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]); + %215 = %214.0; + %216 = nn.relu(%215); + %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %218 = add(%217, %205); + %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]); + %220 = %219.0; + %221 = nn.relu(%220); + %222 = nn.global_avg_pool2d(%221); + %223 = reshape(%222, newshape=[0, -1]); + %224 = nn.dense(%223, meta[relay.Constant][257], units=1000); + add(%224, meta[relay.Constant][258]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnet50", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "mod": mod, + "params": None, + "main_dtype": 
"float32", + } + + +def resnet50_16(): + metatable = {"relay.Constant": resnet50_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] { + %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %1 = %0.0; + %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]); + %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]); + %4 = %3.0; + %5 = nn.relu(%4); + %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]); + %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]); + %8 = %7.0; + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]); + %12 = %11.0; + %13 = nn.relu(%12); + %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]); + %16 = %15.0; + %17 = nn.relu(%16); + %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%18, %19); + %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]); + %22 = %21.0; + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %25 = nn.batch_norm(%24, 
meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]); + %26 = %25.0; + %27 = nn.relu(%26); + %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]); + %30 = %29.0; + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %33 = add(%32, %20); + %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]); + %35 = %34.0; + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]); + %39 = %38.0; + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]); + %43 = %42.0; + %44 = nn.relu(%43); + %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %46 = add(%45, %33); + %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]); + %48 = %47.0; + %49 = nn.relu(%48); + %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]); + %52 = %51.0; + %53 = nn.relu(%52); + %54 = nn.conv2d(%53, meta[relay.Constant][64], strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %55 = nn.batch_norm(%54, 
meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]); + %56 = %55.0; + %57 = nn.relu(%56); + %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = add(%58, %59); + %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %62 = %61.0; + %63 = nn.relu(%62); + %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %66 = %65.0; + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %70 = %69.0; + %71 = nn.relu(%70); + %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %73 = add(%72, %60); + %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]); + %83 = %82.0; + %84 = nn.relu(%83); + %85 = nn.conv2d(%84, 
meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %86 = add(%85, %73); + %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %88 = %87.0; + %89 = nn.relu(%88); + %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %92 = %91.0; + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %96 = %95.0; + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %99 = add(%98, %86); + %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %101 = %100.0; + %102 = nn.relu(%101); + %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %105 = %104.0; + %106 = nn.relu(%105); + %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %109 = %108.0; + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %112 = nn.conv2d(%102, meta[relay.Constant][131], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %113 = add(%111, 
%112); + %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]); + %115 = %114.0; + %116 = nn.relu(%115); + %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]); + %119 = %118.0; + %120 = nn.relu(%119); + %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]); + %123 = %122.0; + %124 = nn.relu(%123); + %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %126 = add(%125, %113); + %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]); + %128 = %127.0; + %129 = nn.relu(%128); + %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]); + %132 = %131.0; + %133 = nn.relu(%132); + %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]); + %136 = %135.0; + %137 = nn.relu(%136); + %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %139 = add(%138, %126); + %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], meta[relay.Constant][165]); + %141 = %140.0; + %142 = nn.relu(%141); + %143 = nn.conv2d(%142, 
meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %152 = add(%151, %139); + %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]); + %154 = %153.0; + %155 = nn.relu(%154); + %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]); + %158 = %157.0; + %159 = nn.relu(%158); + %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]); + %162 = %161.0; + %163 = nn.relu(%162); + %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %165 = add(%164, %152); + %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]); + %167 = %166.0; + %168 = nn.relu(%167); + %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %170 = nn.batch_norm(%169, meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], 
meta[relay.Constant][200]); + %171 = %170.0; + %172 = nn.relu(%171); + %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]); + %175 = %174.0; + %176 = nn.relu(%175); + %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %178 = add(%177, %165); + %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]); + %188 = %187.0; + %189 = nn.relu(%188); + %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %192 = add(%190, %191); + %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]); + %194 = %193.0; + %195 = nn.relu(%194); + %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], meta[relay.Constant][230], meta[relay.Constant][231]); + %198 = %197.0; + %199 = nn.relu(%198); + %200 
= nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]); + %202 = %201.0; + %203 = nn.relu(%202); + %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %205 = add(%204, %192); + %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]); + %207 = %206.0; + %208 = nn.relu(%207); + %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]); + %211 = %210.0; + %212 = nn.relu(%211); + %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]); + %215 = %214.0; + %216 = nn.relu(%215); + %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %218 = add(%217, %205); + %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]); + %220 = %219.0; + %221 = nn.relu(%220); + %222 = nn.global_avg_pool2d(%221); + %223 = reshape(%222, newshape=[0, -1]); + %224 = nn.dense(%223, meta[relay.Constant][257], units=1000); + add(%224, meta[relay.Constant][258]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnet50_16", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def mobilenet_consts(dtype): + return make_consts( + dtype, + [ + (32, 3, 3, 3), # 0 + 
(32,), # 1 + (32,), # 2 + (32,), # 3 + (32,), # 4 + (32, 32, 1, 1), # 5 + (32,), # 6 + (32,), # 7 + (32,), # 8 + (32,), # 9 + (32, 1, 3, 3), # 10 + (32,), # 11 + (32,), # 12 + (32,), # 13 + (32,), # 14 + (16, 32, 1, 1), # 15 + (16,), # 16 + (16,), # 17 + (16,), # 18 + (16,), # 19 + (96, 16, 1, 1), # 20 + (96,), # 21 + (96,), # 22 + (96,), # 23 + (96,), # 24 + (96, 1, 3, 3), # 25 + (96,), # 26 + (96,), # 27 + (96,), # 28 + (96,), # 29 + (24, 96, 1, 1), # 30 + (24,), # 31 + (24,), # 32 + (24,), # 33 + (24,), # 34 + (144, 24, 1, 1), # 35 + (144,), # 36 + (144,), # 37 + (144,), # 38 + (144,), # 39 + (144, 1, 3, 3), # 40 + (144,), # 41 + (144,), # 42 + (144,), # 43 + (144,), # 44 + (24, 144, 1, 1), # 45 + (24,), # 46 + (24,), # 47 + (24,), # 48 + (24,), # 49 + (144, 24, 1, 1), # 50 + (144,), # 51 + (144,), # 52 + (144,), # 53 + (144,), # 54 + (144, 1, 3, 3), # 55 + (144,), # 56 + (144,), # 57 + (144,), # 58 + (144,), # 59 + (32, 144, 1, 1), # 60 + (32,), # 61 + (32,), # 62 + (32,), # 63 + (32,), # 64 + (192, 32, 1, 1), # 65 + (192,), # 66 + (192,), # 67 + (192,), # 68 + (192,), # 69 + (192, 1, 3, 3), # 70 + (192,), # 71 + (192,), # 72 + (192,), # 73 + (192,), # 74 + (32, 192, 1, 1), # 75 + (32,), # 76 + (32,), # 77 + (32,), # 78 + (32,), # 79 + (192, 32, 1, 1), # 80 + (192,), # 81 + (192,), # 82 + (192,), # 83 + (192,), # 84 + (192, 1, 3, 3), # 85 + (192,), # 86 + (192,), # 87 + (192,), # 88 + (192,), # 89 + (32, 192, 1, 1), # 90 + (32,), # 91 + (32,), # 92 + (32,), # 93 + (32,), # 94 + (192, 32, 1, 1), # 95 + (192,), # 96 + (192,), # 97 + (192,), # 98 + (192,), # 99 + (192, 1, 3, 3), # 100 + (192,), # 101 + (192,), # 102 + (192,), # 103 + (192,), # 104 + (64, 192, 1, 1), # 105 + (64,), # 106 + (64,), # 107 + (64,), # 108 + (64,), # 109 + (384, 64, 1, 1), # 110 + (384,), # 111 + (384,), # 112 + (384,), # 113 + (384,), # 114 + (384, 1, 3, 3), # 115 + (384,), # 116 + (384,), # 117 + (384,), # 118 + (384,), # 119 + (64, 384, 1, 1), # 120 + (64,), # 121 + (64,), # 122 + 
(64,), # 123 + (64,), # 124 + (384, 64, 1, 1), # 125 + (384,), # 126 + (384,), # 127 + (384,), # 128 + (384,), # 129 + (384, 1, 3, 3), # 130 + (384,), # 131 + (384,), # 132 + (384,), # 133 + (384,), # 134 + (64, 384, 1, 1), # 135 + (64,), # 136 + (64,), # 137 + (64,), # 138 + (64,), # 139 + (384, 64, 1, 1), # 140 + (384,), # 141 + (384,), # 142 + (384,), # 143 + (384,), # 144 + (384, 1, 3, 3), # 145 + (384,), # 146 + (384,), # 147 + (384,), # 148 + (384,), # 149 + (64, 384, 1, 1), # 150 + (64,), # 151 + (64,), # 152 + (64,), # 153 + (64,), # 154 + (384, 64, 1, 1), # 155 + (384,), # 156 + (384,), # 157 + (384,), # 158 + (384,), # 159 + (384, 1, 3, 3), # 160 + (384,), # 161 + (384,), # 162 + (384,), # 163 + (384,), # 164 + (96, 384, 1, 1), # 165 + (96,), # 166 + (96,), # 167 + (96,), # 168 + (96,), # 169 + (576, 96, 1, 1), # 170 + (576,), # 171 + (576,), # 172 + (576,), # 173 + (576,), # 174 + (576, 1, 3, 3), # 175 + (576,), # 176 + (576,), # 177 + (576,), # 178 + (576,), # 179 + (96, 576, 1, 1), # 180 + (96,), # 181 + (96,), # 182 + (96,), # 183 + (96,), # 184 + (576, 96, 1, 1), # 185 + (576,), # 186 + (576,), # 187 + (576,), # 188 + (576,), # 189 + (576, 1, 3, 3), # 190 + (576,), # 191 + (576,), # 192 + (576,), # 193 + (576,), # 194 + (96, 576, 1, 1), # 195 + (96,), # 196 + (96,), # 197 + (96,), # 198 + (96,), # 199 + (576, 96, 1, 1), # 200 + (576,), # 201 + (576,), # 202 + (576,), # 203 + (576,), # 204 + (576, 1, 3, 3), # 205 + (576,), # 206 + (576,), # 207 + (576,), # 208 + (576,), # 209 + (160, 576, 1, 1), # 210 + (160,), # 211 + (160,), # 212 + (160,), # 213 + (160,), # 214 + (960, 160, 1, 1), # 215 + (960,), # 216 + (960,), # 217 + (960,), # 218 + (960,), # 219 + (960, 1, 3, 3), # 220 + (960,), # 221 + (960,), # 222 + (960,), # 223 + (960,), # 224 + (160, 960, 1, 1), # 225 + (160,), # 226 + (160,), # 227 + (160,), # 228 + (160,), # 229 + (960, 160, 1, 1), # 230 + (960,), # 231 + (960,), # 232 + (960,), # 233 + (960,), # 234 + (960, 1, 3, 3), # 235 + (960,), # 
236 + (960,), # 237 + (960,), # 238 + (960,), # 239 + (160, 960, 1, 1), # 240 + (160,), # 241 + (160,), # 242 + (160,), # 243 + (160,), # 244 + (960, 160, 1, 1), # 245 + (960,), # 246 + (960,), # 247 + (960,), # 248 + (960,), # 249 + (960, 1, 3, 3), # 250 + (960,), # 251 + (960,), # 252 + (960,), # 253 + (960,), # 254 + (320, 960, 1, 1), # 255 + (320,), # 256 + (320,), # 257 + (320,), # 258 + (320,), # 259 + (1280, 320, 1, 1), # 260 + (1280,), # 261 + (1280,), # 262 + (1280,), # 263 + (1280,), # 264 + (1000, 1280, 1, 1), # 265 + ], + ) + + +def mobilenet(): + metatable = {"relay.Constant": mobilenet_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] { + %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]); + %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]); + %2 = %1.0; + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]); + %6 = %5.0; + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]); + %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]); + %10 = %9.0; + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]); + %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]); + %14 = %13.0; + %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %16 = nn.batch_norm(%15, meta[relay.Constant][21], 
meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]); + %17 = %16.0; + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, kernel_size=[3, 3]); + %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]); + %21 = %20.0; + %22 = nn.relu(%21); + %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]); + %25 = %24.0; + %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]); + %28 = %27.0; + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]); + %32 = %31.0; + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]); + %36 = %35.0; + %37 = add(%36, %25); + %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]); + %40 = %39.0; + %41 = nn.relu(%40); + %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], 
meta[relay.Constant][58], meta[relay.Constant][59]); + %44 = %43.0; + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %47 = nn.batch_norm(%46, meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]); + %48 = %47.0; + %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]); + %51 = %50.0; + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %55 = %54.0; + %56 = nn.relu(%55); + %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %59 = %58.0; + %60 = add(%59, %48); + %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %63 = %62.0; + %64 = nn.relu(%63); + %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %67 = %66.0; + %68 = nn.relu(%67); + %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %71 = %70.0; + %72 
= add(%71, %60); + %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %83 = %82.0; + %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %86 = %85.0; + %87 = nn.relu(%86); + %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %90 = %89.0; + %91 = nn.relu(%90); + %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %94 = %93.0; + %95 = add(%94, %83); + %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %98 = %97.0; + %99 = nn.relu(%98); + %100 = nn.conv2d(%99, 
meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134]); + %102 = %101.0; + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]); + %106 = %105.0; + %107 = add(%106, %95); + %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]); + %110 = %109.0; + %111 = nn.relu(%110); + %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]); + %114 = %113.0; + %115 = nn.relu(%114); + %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]); + %118 = %117.0; + %119 = add(%118, %107); + %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]); + %122 = %121.0; + %123 = nn.relu(%122); + %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], 
meta[relay.Constant][164]); + %126 = %125.0; + %127 = nn.relu(%126); + %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %129 = nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]); + %130 = %129.0; + %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]); + %133 = %132.0; + %134 = nn.relu(%133); + %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]); + %137 = %136.0; + %138 = nn.relu(%137); + %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]); + %141 = %140.0; + %142 = add(%141, %130); + %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], 
meta[relay.Constant][198], meta[relay.Constant][199]); + %153 = %152.0; + %154 = add(%153, %142); + %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]); + %157 = %156.0; + %158 = nn.relu(%157); + %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]); + %161 = %160.0; + %162 = nn.relu(%161); + %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]); + %165 = %164.0; + %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]); + %168 = %167.0; + %169 = nn.relu(%168); + %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]); + %172 = %171.0; + %173 = nn.relu(%172); + %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]); + %176 = %175.0; + %177 = add(%176, %165); + %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %179 = nn.batch_norm(%178, 
meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][235], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]); + %188 = %187.0; + %189 = add(%188, %177); + %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]); + %192 = %191.0; + %193 = nn.relu(%192); + %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]); + %196 = %195.0; + %197 = nn.relu(%196); + %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]); + %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]); + %200 = %199.0; + %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]); + %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]); + %203 = %202.0; + %204 = nn.relu(%203); + %205 = nn.global_avg_pool2d(%204); + %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], 
channels=1000, kernel_size=[1, 1]); + reshape(%206, newshape=[0, -1]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mobilenet", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def mobilenet_16(): + metatable = {"relay.Constant": mobilenet_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] { + %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]); + %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]); + %2 = %1.0; + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]); + %6 = %5.0; + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]); + %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]); + %10 = %9.0; + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]); + %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]); + %14 = %13.0; + %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %16 = nn.batch_norm(%15, meta[relay.Constant][21], meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]); + %17 = %16.0; + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, 
kernel_size=[3, 3]); + %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]); + %21 = %20.0; + %22 = nn.relu(%21); + %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]); + %25 = %24.0; + %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]); + %28 = %27.0; + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]); + %32 = %31.0; + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]); + %36 = %35.0; + %37 = add(%36, %25); + %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]); + %40 = %39.0; + %41 = nn.relu(%40); + %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58], meta[relay.Constant][59]); + %44 = %43.0; + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %47 = nn.batch_norm(%46, 
meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]); + %48 = %47.0; + %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]); + %51 = %50.0; + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %55 = %54.0; + %56 = nn.relu(%55); + %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %59 = %58.0; + %60 = add(%59, %48); + %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %63 = %62.0; + %64 = nn.relu(%63); + %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %67 = %66.0; + %68 = nn.relu(%67); + %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %71 = %70.0; + %72 = add(%71, %60); + %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], 
meta[relay.Constant][98], meta[relay.Constant][99]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %83 = %82.0; + %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %86 = %85.0; + %87 = nn.relu(%86); + %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %90 = %89.0; + %91 = nn.relu(%90); + %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %94 = %93.0; + %95 = add(%94, %83); + %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %98 = %97.0; + %99 = nn.relu(%98); + %100 = nn.conv2d(%99, meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], 
meta[relay.Constant][133], meta[relay.Constant][134]); + %102 = %101.0; + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]); + %106 = %105.0; + %107 = add(%106, %95); + %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]); + %110 = %109.0; + %111 = nn.relu(%110); + %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]); + %114 = %113.0; + %115 = nn.relu(%114); + %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]); + %118 = %117.0; + %119 = add(%118, %107); + %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]); + %122 = %121.0; + %123 = nn.relu(%122); + %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164]); + %126 = %125.0; + %127 = nn.relu(%126); + %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %129 = 
nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]); + %130 = %129.0; + %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]); + %133 = %132.0; + %134 = nn.relu(%133); + %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]); + %137 = %136.0; + %138 = nn.relu(%137); + %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]); + %141 = %140.0; + %142 = add(%141, %130); + %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199]); + %153 = %152.0; + %154 = add(%153, %142); + %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, 
kernel_size=[1, 1]); + %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]); + %157 = %156.0; + %158 = nn.relu(%157); + %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]); + %161 = %160.0; + %162 = nn.relu(%161); + %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]); + %165 = %164.0; + %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]); + %168 = %167.0; + %169 = nn.relu(%168); + %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]); + %172 = %171.0; + %173 = nn.relu(%172); + %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]); + %176 = %175.0; + %177 = add(%176, %165); + %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %179 = nn.batch_norm(%178, meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][235], 
padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]); + %188 = %187.0; + %189 = add(%188, %177); + %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]); + %192 = %191.0; + %193 = nn.relu(%192); + %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]); + %196 = %195.0; + %197 = nn.relu(%196); + %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]); + %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]); + %200 = %199.0; + %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]); + %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]); + %203 = %202.0; + %204 = nn.relu(%203); + %205 = nn.global_avg_pool2d(%204); + %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], channels=1000, kernel_size=[1, 1]); + reshape(%206, newshape=[0, -1]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mobilenet_16", + "input_shapes": {"data": [1, 3, 224, 224]}, + 
"input_dtypes": {"data": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def batch_norm_extract(): + consts = make_consts( + "float32", + [ + (32,), # 0 + (32,), # 1 + (32,), # 2 + (32,), # 3 + ], + ) + metatable = {"relay.Constant": consts} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%FunctionVar_0: Tensor[(1, 32, 112, 112), float32]) -> Tensor[(1, 32, 112, 112), float32] { + %3 = nn.batch_norm(%FunctionVar_0, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %3.0 + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "batch_norm_extract", + "input_shapes": {"FunctionVar_0": [1, 32, 112, 112]}, + "input_dtypes": {"FunctionVar_0": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def resnext50_32x4d_consts(dtype): + return make_consts( + dtype, + [ + (128, 64, 1, 1), # 0 + (128, 4, 3, 3), # 1 + (256, 128, 1, 1), # 2 + (256, 64, 1, 1), # 3 + (128, 256, 1, 1), # 4 + (128, 4, 3, 3), # 5 + (256, 128, 1, 1), # 6 + (128, 256, 1, 1), # 7 + (128, 4, 3, 3), # 8 + (256, 128, 1, 1), # 9 + (256, 256, 1, 1), # 10 + (256, 8, 3, 3), # 11 + (512, 256, 1, 1), # 12 + (512, 256, 1, 1), # 13 + (256, 512, 1, 1), # 14 + (256, 8, 3, 3), # 15 + (512, 256, 1, 1), # 16 + (256, 512, 1, 1), # 17 + (256, 8, 3, 3), # 18 + (512, 256, 1, 1), # 19 + (256, 512, 1, 1), # 20 + (256, 8, 3, 3), # 21 + (512, 256, 1, 1), # 22 + (512, 512, 1, 1), # 23 + (512, 16, 3, 3), # 24 + (1024, 512, 1, 1), # 25 + (1024, 512, 1, 1), # 26 + (512, 1024, 1, 1), # 27 + (512, 16, 3, 3), # 28 + (1024, 512, 1, 1), # 29 + (512, 1024, 1, 1), # 30 + (512, 16, 3, 3), # 31 + (1024, 512, 1, 1), # 32 + (512, 1024, 1, 1), # 33 + (512, 16, 3, 3), # 34 + (1024, 512, 1, 1), # 35 + (512, 1024, 1, 1), # 36 + (512, 16, 3, 3), # 37 + (1024, 512, 1, 1), # 38 + (512, 1024, 1, 1), # 39 + (512, 16, 3, 3), # 40 + (1024, 512, 1, 1), # 41 + (1024, 1024, 1, 1), # 42 + (1024, 32, 3, 3), # 43 + 
(2048, 1024, 1, 1), # 44 + (2048, 1024, 1, 1), # 45 + (1024, 2048, 1, 1), # 46 + (1024, 32, 3, 3), # 47 + (2048, 1024, 1, 1), # 48 + (1024, 2048, 1, 1), # 49 + (1024, 32, 3, 3), # 50 + (2048, 1024, 1, 1), # 51 + ], + ) + + +def resnext50_32x4d(): + metatable = {"relay.Constant": resnext50_32x4d_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 64, 56, 56), float32]) { + %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %1 = nn.relu(%0); + %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %6 = add(%4, %5); + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %13 = add(%12, %7); + %14 = nn.relu(%13); + %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %16 = nn.relu(%15); + %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%19, %14); + %21 = nn.relu(%20); + %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, 
kernel_size=[3, 3]); + %25 = nn.relu(%24); + %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %28 = add(%26, %27); + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %35 = add(%34, %29); + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %38 = nn.relu(%37); + %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %42 = add(%41, %36); + %43 = nn.relu(%42); + %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %47 = nn.relu(%46); + %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %49 = add(%48, %43); + %50 = nn.relu(%49); + %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %54 = nn.relu(%53); + %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %56 = nn.conv2d(%50, 
meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %57 = add(%55, %56); + %58 = nn.relu(%57); + %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = nn.relu(%59); + %61 = nn.conv2d(%60, meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %62 = nn.relu(%61); + %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %64 = add(%63, %58); + %65 = nn.relu(%64); + %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %69 = nn.relu(%68); + %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %71 = add(%70, %65); + %72 = nn.relu(%71); + %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %74 = nn.relu(%73); + %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %78 = add(%77, %72); + %79 = nn.relu(%78); + %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %81 = nn.relu(%80); + %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %83 = nn.relu(%82); + %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %85 = add(%84, %79); + %86 = nn.relu(%85); + %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %88 = nn.relu(%87); + %89 = nn.conv2d(%88, meta[relay.Constant][40], 
padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %90 = nn.relu(%89); + %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %92 = add(%91, %86); + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][42], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %95 = nn.relu(%94); + %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %100 = add(%98, %99); + %101 = nn.relu(%100); + %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %105 = nn.relu(%104); + %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %107 = add(%106, %101); + %108 = nn.relu(%107); + %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %112 = nn.relu(%111); + %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %114 = add(%113, %108); + nn.relu(%114) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnext50_32x4d", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def resnext50_32x4d_16(): + metatable = {"relay.Constant": 
resnext50_32x4d_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 64, 56, 56), float16]) { + %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %1 = nn.relu(%0); + %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %6 = add(%4, %5); + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %13 = add(%12, %7); + %14 = nn.relu(%13); + %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %16 = nn.relu(%15); + %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%19, %14); + %21 = nn.relu(%20); + %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %25 = nn.relu(%24); + %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %28 = add(%26, 
%27); + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %35 = add(%34, %29); + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %38 = nn.relu(%37); + %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %42 = add(%41, %36); + %43 = nn.relu(%42); + %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %47 = nn.relu(%46); + %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %49 = add(%48, %43); + %50 = nn.relu(%49); + %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %54 = nn.relu(%53); + %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %56 = nn.conv2d(%50, meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %57 = add(%55, %56); + %58 = nn.relu(%57); + %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = nn.relu(%59); + %61 = nn.conv2d(%60, 
meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %62 = nn.relu(%61); + %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %64 = add(%63, %58); + %65 = nn.relu(%64); + %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %69 = nn.relu(%68); + %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %71 = add(%70, %65); + %72 = nn.relu(%71); + %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %74 = nn.relu(%73); + %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %78 = add(%77, %72); + %79 = nn.relu(%78); + %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %81 = nn.relu(%80); + %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %83 = nn.relu(%82); + %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %85 = add(%84, %79); + %86 = nn.relu(%85); + %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %88 = nn.relu(%87); + %89 = nn.conv2d(%88, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %90 = nn.relu(%89); + %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %92 = add(%91, %86); + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][42], 
padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %95 = nn.relu(%94); + %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %100 = add(%98, %99); + %101 = nn.relu(%100); + %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %105 = nn.relu(%104); + %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %107 = add(%106, %101); + %108 = nn.relu(%107); + %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %112 = nn.relu(%111); + %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %114 = add(%113, %108); + nn.relu(%114) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnext50_32x4d_16", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def describe_onnx(name, filename): + """Returns the description of the ONNX model at filename, which can be passed to from_onnx to actually load + the model. Note that ? 
(ie unknown) shape dimensions must be manually changed to concrete dimensions + which are consistent with the overall model.""" + onnx_model = onnx.load(MODEL_PREFIX + filename) + input_shapes = {} + input_dtypes = {} + initializer_names = [n.name for n in onnx_model.graph.initializer] + for input_info in onnx_model.graph.input: + if input_info.name not in initializer_names: + _, shape, dtype, _ = tvm.relay.frontend.onnx.get_info(input_info) + if dtype is None: + raise ValueError(f"Unknown dtype on input '{input_info.name}' is not supported.") + input_shapes.update({input_info.name: shape}) + input_dtypes.update({input_info.name: dtype}) + print( + f"{{'name': '{name}', 'filename': '{filename}', 'input_shapes': {input_shapes}, 'input_dtypes': {input_dtypes}, 'main_dtype': 'float32'}}" + ) + + +def from_onnx(model): + logging.info("-------------------- BEGIN ONNX IMPORT --------------------") + + filename = MODEL_PREFIX + model["filename"] + logging.info(f"Loading ONNX model from {filename}") + + onnx_model = onnx.load(filename) + logging.info(f"Loaded model from {filename}") + + mod, params = tvm.relay.frontend.from_onnx( + onnx_model, model["input_shapes"], freeze_params=True + ) + mod = tvm.relay.transform.InferType()(mod) + logging.info("-------------------- END ONNX IMPORT --------------------") + + logging.info(f"Imported model:\n{mod}") + logging.info(f"Params:\n{params}") + + return { + "name": model["name"], + "input_shapes": model["input_shapes"], + "input_dtypes": model["input_dtypes"], + "mod": mod, + "params": params, + "main_dtype": model["main_dtype"], + } + + +def to_onnx(model): + logging.info("-------------------- BEGIN ONNX EXPORT --------------------") + short_filename = model["name"] + ".onnx" + filename = MODEL_PREFIX + short_filename + logging.info(f"Saving ONNX model to {filename}") + + params = model["params"] + if params is None: + params = {} + tvm.contrib.target.onnx.to_onnx(model["mod"], params, model["name"], path=filename) + 
logging.info("-------------------- END ONNX EXPORT --------------------") + + return { + "name": model["name"], + "filename": short_filename, + "input_shapes": model["input_shapes"], + "input_dtypes": model["input_dtypes"], + "main_dtype": model["main_dtype"], + } diff --git a/tests/python/relay/collage/test_collage_partitioner.py b/tests/python/relay/collage/test_collage_partitioner.py new file mode 100644 index 0000000000000..e1217fde44192 --- /dev/null +++ b/tests/python/relay/collage/test_collage_partitioner.py @@ -0,0 +1,269 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import logging +import tempfile +import os +import menangerie + +# The following are necessary to force global functions or pattern tables to be registered +from tvm.relay.op.contrib.cutlass import partition_for_cutlass +from tvm.contrib.cutlass import num_cutlass_partitions +from tvm.relay.op.contrib.cublas import partition_for_cublas +from tvm.relay.op.contrib.cudnn import partition_for_cudnn + +logging.basicConfig(level=logging.INFO) + +HOST = tvm.target.Target("llvm") +CUDA = tvm.target.Target("cuda", HOST) + +### +### Rename to match your hardware, eg ..._vt100... 
+### +TUNING_LOG = "collage_autotvm_rtx3070.tuninglog" + +### +### If true, runs final model under nvprof +### +PROFILE = True + +### +### If true, run all models +### +ALL_MODELS = False + +### +### If true, run all configurations +### +ALL_CONFIGS = False + +TVM_MAX_MAX_DEPTH = 8 +BYOC_MAX_MAX_DEPTH = 8 + +runner_template = """ +import tvm +import tvm.runtime.vm +import numpy as np +import logging + +logging.basicConfig(level=logging.INFO) + +MEASURE_NUMBER = 20 +MEASURE_REPEAT = 5 +WARMUP_MIN_REPEAT_MS = 250 + +def arg_for(shape, dtype, device): + return tvm.nd.array( + np.random.rand(*shape).astype(dtype), device=device) + +def vm_estimate_seconds(device, vm, args): + vm.benchmark(device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, **args) + return vm.benchmark(device, repeat=MEASURE_REPEAT, number=MEASURE_NUMBER, min_repeat_ms=0, + **args) + + +def run(label, name, device, lib_path, code_path, input_shapes, input_dtypes): + logging.info(f"Loading compiled code for {name} generated by {label} from {lib_path} and {code_path}...") + loaded_lib = tvm.runtime.load_module(lib_path) + loaded_code = bytearray(open(code_path, "rb").read()) + loaded_exe = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib) + vm = tvm.runtime.vm.VirtualMachine(loaded_exe, device) + args = { + input_name: arg_for(input_shapes[input_name], input_dtypes[input_name], device) + for input_name in input_shapes.keys() + } + logging.info(f"Benchmarking for {name} generated by {label}...") + profile = vm_estimate_seconds(device, vm, args) + logging.info(f"Benchmarked for {name} generated by {label}: {profile}") + logging.info(f"RESULT: {label} | {name} | {profile.median * 1e3}ms") + +if __name__ == "__main__": +""" + + +def compile_and_benchmark(label, model, targets, dev, tmp_dir): + logging.info(f"Compiling {model['name']} using {label} with {targets}...") + exe = tvm.relay.vm.compile(model["mod"], target=targets, params=model["params"]) + lib_path = os.path.join(tmp_dir, 
"lib.so") + code_path = os.path.join(tmp_dir, "code.ro") + code, lib = exe.save() + logging.info(f"Saving VM code to {code_path}...") + with open(code_path, "wb") as fo: + fo.write(code) + logging.info(f"Exporting library to {lib_path}...") + lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc") + runner = f"{runner_template} run('{label}', '{model['name']}', tvm.device({dev.device_type}), '{lib_path}', '{code_path}', {model['input_shapes']}, {model['input_dtypes']})\n" + runner_path = os.path.join(tmp_dir, "runner.py") + logging.info(f"Saving runner to {runner_path}...") + with open(runner_path, "w") as fo: + fo.write(runner) + + logging.info(f"Invoking runner...") + if PROFILE: + profile_path = os.path.join(tmp_dir, "profile.txt") + os.system(f"nsys nvprof -o {profile_path} python3 {runner_path}") + else: + os.system(f"python3 {runner_path}") + + +def collage(model): + logging.info(f"collage | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + targets = [] + targets.append(CUDA) + use_fp16 = model["main_dtype"] == "float16" + targets.append( + tvm.target.Target(f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST) + ) + tmp_dir = tempfile.mkdtemp() + targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST)) + targets.append(tvm.target.Target("cublas", HOST)) + targets.append(tvm.target.Target("cudnn", HOST)) + config = { + "relay.collage.tvm_max_max_depth": TVM_MAX_MAX_DEPTH, + "relay.collage.byoc_max_max_depth": BYOC_MAX_MAX_DEPTH, + } + logging.info(f"Using PassContext(config={config}") + ctxt = tvm.transform.PassContext(config=config) + config = tvm.target.make_compilation_config(ctxt, targets) + with ctxt: + mod = model["mod"] + mod = 
tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + logging.info("-------------- BEGIN INDEXED --------------") + logging.info(mod) + logging.info("-------------- END INDEXED ----------------") + mod = tvm.relay.transform.CollagePartition(config)(mod) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("collage", partitioned_model, targets, dev, tmp_dir) + + +def just_tensorrt(model): + logging.info(f"just_tensorrt | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + logging.info("Partitioning for TensorRT...") + use_fp16 = model["main_dtype"] == "float16" + trt_target = tvm.target.Target( + f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST + ) + mod = tvm.relay.op.contrib.partition_for_tensorrt( + mod=model["mod"], params=model["params"], target=trt_target + ) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + targets = [] + targets.append(CUDA) + targets.append(trt_target) + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_tensorrt", partitioned_model, targets, dev, tmp_dir) + + +def just_cutlass(model): + logging.info(f"just_cutlass | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL 
----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + logging.info("Partitioning for CUTLASS...") + mod = tvm.relay.op.contrib.partition_for_cutlass(model["mod"], model["params"]) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + targets = [] + targets.append(CUDA) + targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST)) + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_cutlass", partitioned_model, targets, dev, tmp_dir) + + +def just_tvm(model): + logging.info(f"just_tvm | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_tvm", model, CUDA, dev, tmp_dir) + + +def tvm_with_libs(model): + logging.info(f"tvm_with_libs | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + cuda_target = tvm.target.Target("cuda -libs=cudnn,cublas", HOST) + tvm.relay.collage.autotvm_tune_module(model["mod"], cuda_target, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + dev = tvm.device(cuda_target.kind.device_type) + compile_and_benchmark("tvm_with_libs", model, cuda_target, dev, tmp_dir) + + +def test_all(): + make_models 
= [] + make_models.append(menangerie.resnext50_32x4d) + if ALL_MODELS: + make_models.append(menangerie.resnext50_32x4d_16) + make_models.append(menangerie.gpt2_16) + make_models.append(menangerie.gpt2) + make_models.append(menangerie.mobilenet_16) + make_models.append(menangerie.mobilenet) + make_models.append(menangerie.resnet50_16) + make_models.append(menangerie.resnet50) + run_models = [] + if ALL_CONFIGS: + run_models.append(just_tensorrt) + run_models.append(just_tvm) + run_models.append(tvm_with_libs) + run_models.append(collage) + for make_model in make_models: + model = make_model() + for run_model in run_models: + run_model(model) + + +def test_mini(): + collage(menangerie.gpt2_16_for_cutlass_extract()) + + +if __name__ == "__main__": + # test_all() + test_mini() diff --git a/tests/python/relay/collage/test_sub_graph.py b/tests/python/relay/collage/test_sub_graph.py new file mode 100644 index 0000000000000..a231bd72454fa --- /dev/null +++ b/tests/python/relay/collage/test_sub_graph.py @@ -0,0 +1,400 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import logging +import tvm.testing + +logging.basicConfig(level=logging.INFO) + +partition_on_indexes_for_testing = tvm._ffi.get_global_func( + "relay.collage.partition_on_indexes_for_testing" +) + + +def print_with_indexes(mod): + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + print(mod) + + +def process(mod, max_outputs, allow_taps, indexes, labels=None): + mod = tvm.relay.transform.InferType()(mod) + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + mod = partition_on_indexes_for_testing(max_outputs, allow_taps, indexes, labels)(mod) + return mod + + +def assert_eq(in_mod, expected_mod, actual_mod): + in_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(in_mod) + expected_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(expected_mod) + if not tvm.ir.structural_equal(actual_mod, expected_mod, True): + # Print everything in full so we can see what's going on when things fail. + print("Input module:") + print(in_mod) + print("Expected module:") + print(expected_mod) + print("Actual module:") + print(actual_mod) + # Assert again so as to see the actual disagreeing sub-expressions. 
+ tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True) + + +def test_single_op(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = (fn(%x, %y, Composite="a") { add(%x, %y) })(%c, %d); + subtract(%0, %1) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 1, False, [7], ["a"])) + + +def test_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 6 + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = (fn(%w, %x, %y, %z, Composite="a") { (add(%y, %z), add(%w, %x)) })(%c, %d, %a, %b); + %1 = %0.0; + %2 = %0.1; + subtract(%1, %2) + } + """ + ) + + # No rewrite since 2 outputs + assert_eq(input(), input(), process(input(), 1, False, [6, 7], ["a", "a"])) + # Rewrite + assert_eq(input(), expected(), process(input(), 2, False, [6, 7], ["a", "a"])) + + +def test_classic_conv2d_add_relu(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %0 = nn.conv2d(%a, %b); // node 
8 + %1 = add(%0, %c); // node 9 + %2 = nn.relu(%1); // node 10 + subtract(%2, %d) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %2 = (fn(%x, %y, %z, Composite="a") { + %0 = nn.conv2d(%x, %y); + %1 = add(%0, %z); + nn.relu(%1) + })(%a, %b, %c); + subtract(%2, %d) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 1, False, [8, 9, 10], ["a", "a", "a"])) + + +def test_diamond_single_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) // node 10 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + add(%2, %3) + })(%a, %b) + } + """ + ) + + assert_eq( + input(), expected(), process(input(), 1, False, [5, 6, 7, 9, 10], ["a", "a", "a", "a", "a"]) + ) + + +def test_diamond_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def 
@main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %4 = (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + (%2, %3) + })(%a, %b); + %5 = %4.0; + %6 = %4.1; + add(%5, %6) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 2, False, [5, 6, 7, 9], ["a", "a", "a", "a"])) + + +def test_with_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + add(%1, %0) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %2 = (fn (%x, %y, Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + (%0, %1) + })(%a, %b); + %3 = %2.1; + %4 = %2.0; + add(%3, %4) + } + """ + ) + + # No rewrite since has tap + assert_eq(input(), input(), process(input(), 2, False, [5, 6], ["a", "a"])) + # Rewrite + assert_eq(input(), expected(), process(input(), 2, True, [5, 6], ["a", "a"])) + + +def test_no_cycles(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 3 + %1 = add(%0, %b); + add(%1, %b) // node 5 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + (fn(%x, %y, Composite="a") { + %0 = add(%x, %y); + %1 = add(%0, %y); + add(%1, %y) + })(%a, %b) + } + """ + ) + + # No rewrite since would create cycle + assert_eq(input(), input(), process(input(), 2, False, [3, 5], ["a", "a'"])) + # No cycle + 
assert_eq(input(), expected(), process(input(), 2, False, [3, 4, 5], ["a", "a", "a"])) + + +def test_labels_direct_connection(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + nn.relu(%7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); + %4 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + add(%2, %3) + })(%0); + %7 = (fn(%z, Composite="b") { + %5 = nn.relu(%z); + %6 = nn.relu(%z); + add(%5, %6) + })(%4); + nn.relu(%7) + } + """ + ) + + assert_eq( + input(), + expected(), + process( + input(), + 1, + False, + [3, 4, 5, 6, 7, 8, 9, 10, 11], + ["", "a", "a", "a", "a", "b", "b", "b", ""], + ), + ) + + +def test_labels_nested_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + add(%2, %7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); + %5 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + %4 = add(%2, %3); + (%2, %4) + })(%0); + %8 = (fn(%z, Composite="b") { + %6 = nn.relu(%z); + %7 = nn.relu(%z); + add(%6, %7) + })(%5.1); + add(%5.0, %8) + } + """ + ) + + assert_eq( + input(), + expected(), + process(input(), 2, True, [4, 5, 6, 7, 8, 9, 10], 
["a", "a", "a", "a", "b", "b", "b"]), + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relay/test_pass_collage_partition.py b/tests/python/relay/test_pass_collage_partition.py new file mode 100644 index 0000000000000..cd0a915c5fea6 --- /dev/null +++ b/tests/python/relay/test_pass_collage_partition.py @@ -0,0 +1,567 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +from tvm import relay +import pytest +from tvm.relay.transform import CollagePartition, InferType +from tvm.target import make_compilation_config +from tvm.relay.collage import MockEstimator +from unittest.mock import patch +from tvm.relay.dataflow_pattern import is_op, wildcard + + +def cpu_pattern_table(): + def relu_pattern(): + return is_op("nn.relu")(wildcard()) + + def add_pattern(): + return is_op("add")(wildcard(), wildcard()) + + def concatenate_pattern(): + return is_op("concatenate")(wildcard()) + + return [ + ("relu", relu_pattern(), lambda x: True), + ("add", add_pattern(), lambda x: True), + ("concatenate", concatenate_pattern(), lambda x: True), + ] + + +def _mock_get_pattern_table(target): + if target == "test_external_cpu_target": + return cpu_pattern_table() + + +def run_collage(mod, targets, cost_estimator, tvm_max_depth=8, byoc_max_depth=8): + ctxt = { + "relay.collage.tvm_max_max_depth": tvm_max_depth, + "relay.collage.byoc_max_max_depth": byoc_max_depth, + } + pass_ctxt = tvm.transform.PassContext(config=ctxt) + with pass_ctxt: + config = make_compilation_config(pass_ctxt, targets) + mod = InferType()(mod) + mod = CollagePartition(config, cost_estimator)(mod) + return mod + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_single_op_llvm(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + nn.relu(%x) +} +""" + expected_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:11 */) -> Tensor[(10, 10), float32] { + nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 1, + "test_external_cpu_target": 2, + } + ) + mod = run_collage(mod, targets, cost_estimator) 
+ assert mod.astext() == expected_txt + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_single_op_byoc(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + nn.relu(%x) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:11 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator) + assert mod.astext() == expected_txt + + +@pytest.mark.parametrize("byoc_max_depth", [1, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_diamond_valid_topology(mock_get_pattern_table, byoc_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = abs(%0); + %2 = nn.relu(%1); + add(%1, %2) +} +""" + expected_3_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* 
ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:4:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu_add(%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_add") -> Tensor[(10, 10), float32] { + %1 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2 = %1(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] */; + %3 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_03, %FunctionVar_1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3(%FunctionVar_02, %2) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target_nn_relu_add(%5) /* 
ty=Tensor[(10, 10), float32] */ +} +""" + expected_1_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_01, %FunctionVar_1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%FunctionVar_0, %1) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:4:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), 
float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target(%5) /* ty=Tensor[(10, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=byoc_max_depth) + + expected_mod = tvm.parser.fromtext(expected_1_txt if byoc_max_depth == 1 else expected_3_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@pytest.mark.parametrize("tvm_max_depth", [1, 2, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_tvm_max_depth(mock_get_pattern_table, tvm_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + nn.relu(%1) +} +""" + expected_txts = { + 1: """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), 
float32] */; + %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %4(%3) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target(%x) /* ty=Tensor[(10, 10), float32] */ +} +""", + 2: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %1 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %2 = nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%2) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + 3: """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %0 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + %1 = nn.relu(%0) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + } + + 
mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 100, + "test_external_cpu_target": 99, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=tvm_max_depth, byoc_max_depth=1) + + expected_mod = tvm.parser.fromtext(expected_txts[tvm_max_depth]) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@pytest.mark.parametrize("byoc_max_depth", [1, 2, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_byoc_max_depth(mock_get_pattern_table, byoc_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + nn.relu(%1) +} +""" + expected_txts = { + 1: """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %0 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + %1 = nn.relu(%0) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + 2: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") 
-> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%1) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %3 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + @collage_test_external_cpu_target_nn_relu_nn_relu(%3) /* ty=Tensor[(10, 10), float32] */ +} +""", + 3: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %4(%3) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] 
span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */ +} +""", + } + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 99, + "test_external_cpu_target": 100, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=byoc_max_depth) + + expected_mod = tvm.parser.fromtext(expected_txts[byoc_max_depth]) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_output_tuple(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = abs(%1); + (%0, %1, %2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:6:4 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:6:8 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + (%1, %3) /* ty=(Tensor[(10, 10), 
float32], Tensor[(10, 10), float32]) */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %4 = @collage_test_external_cpu_target(%x) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */; + %5 = %4.1 /* ty=Tensor[(10, 10), float32] */; + %6 = %4.0 /* ty=Tensor[(10, 10), float32] */; + %7 = abs(%5) /* ty=Tensor[(10, 10), float32] span=from_string:6:12 */; + (%6, %5, %7) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) span=from_string:3:3 */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=2, byoc_max_depth=2) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_intermediate_tuple(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = (%0, %1); + concatenate(%2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:5:9 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), 
float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:13 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + (%1, %3) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */ +} + +def @collage_test_external_cpu_target_concatenate(%FunctionVar_03: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_concatenate") -> Tensor[(20, 10), float32] { + %4 = fn (%FunctionVar_04: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */, Composite="concatenate") -> Tensor[(20, 10), float32] { + concatenate(%FunctionVar_04) /* ty=Tensor[(20, 10), float32] span=from_string:3:3 */ + } /* ty=fn ((Tensor[(10, 10), float32], Tensor[(10, 10), float32])) -> Tensor[(20, 10), float32] */; + %4(%FunctionVar_03) /* ty=Tensor[(20, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(20, 10), float32] { + %5 = @collage_test_external_cpu_target(%x) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */; + %6 = %5.0 /* ty=Tensor[(10, 10), float32] */; + %7 = %5.1 /* ty=Tensor[(10, 10), float32] */; + %8 = (%6, %7) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) span=from_string:6:15 */; + @collage_test_external_cpu_target_concatenate(%8) /* ty=Tensor[(20, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + 
"llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=3, byoc_max_depth=5) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_fusion_benefit(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = abs(%x); + %3 = nn.relu(%2); + %4 = add(%1, %3); + %5 = nn.relu(%4); + abs(%5) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:7:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = fn (%FunctionVar_05: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_05) /* ty=Tensor[(10, 10), float32] span=from_string:7:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] 
*/; + %4 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + %5 = %3(%FunctionVar_1) /* ty=Tensor[(10, 10), float32] */; + %6 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_11: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_02, %FunctionVar_11) /* ty=Tensor[(10, 10), float32] span=from_string:8:16 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %7 = %6(%4, %5) /* ty=Tensor[(10, 10), float32] */; + %8 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:9:7 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %8(%7) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:5:12 */) -> Tensor[(10, 10), float32] { + %9 = abs(%x) /* ty=Tensor[(10, 10), float32] span=from_string:6:16 */; + %10 = @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu(%x, %9) /* ty=Tensor[(10, 10), float32] */; + abs(%10) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 5, + "test_external_cpu_target": 6, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=5) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_double_residual(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = 
nn.relu(%x); + %1 = abs(%0); + %2 = add(%0, %1); + add(%1, %2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_add_add(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_add_add") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_12: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_02, %FunctionVar_12) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_1, %FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_11: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_01, %FunctionVar_11) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%FunctionVar_0, %1) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:5:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + 
%3(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target_add_add(%5, %4) /* ty=Tensor[(10, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=4, byoc_max_depth=4) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +if __name__ == "__main__": + pytest.main([__file__])