diff --git a/CMakeLists.txt b/CMakeLists.txt index 306a8be308584..5e9de5b66fa4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -295,6 +295,7 @@ tvm_file_glob(GLOB_RECURSE RELAY_OP_SRCS ) tvm_file_glob(GLOB_RECURSE RELAY_PASS_SRCS src/relay/analysis/*.cc + src/relay/collage/*.cc src/relay/transforms/*.cc src/relay/quantize/*.cc ) diff --git a/collage_autotvm_rtx3070.tuninglog b/collage_autotvm_rtx3070.tuninglog new file mode 100644 index 0000000000000..d71fdc551054d --- /dev/null +++ b/collage_autotvm_rtx3070.tuninglog @@ -0,0 +1,314 @@ +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 4580561, "code_hash": null, "entity": [["tile_x", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 4, 2]], ["tile_k", "sp", [-1, 24, 1]]]}, "result": [[0.0011703262512077295, 0.0011717971932367149, 0.001173296154589372], 0, 2.1227333545684814, 1649806187.4005826], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 15358, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 4], ["warp_col_tiles", "ot", 2], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 8]]}, "result": [[4.1945246429498836e-05, 4.20124894832511e-05, 4.206600389509219e-05], 0, 1.5296962261199951, 1649810763.887689], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, 
"entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.0017168304615384617, 0.0017169396923076923, 0.0017175537692307693], 0, 1.3375377655029297, 1649811474.8204205], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 44253765, "code_hash": null, "entity": [["tile_x", "sp", [-1, 16, 4, 1]], ["tile_y", "sp", [-1, 12, 4, 1]], ["tile_k", "sp", [-1, 4, 4]]]}, "result": [[0.0010211715700483093, 0.0010240237777777777, 0.001028588038647343], 0, 5.8342344760894775, 1649812254.5651937], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 15439, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 4], ["warp_col_tiles", "ot", 2], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 8]]}, "result": [[0.0001372752479338843, 0.00013770897272727273, 0.00013806367107438016], 0, 1.7162327766418457, 1649813685.478743], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 6, "code_hash": null, "entity": [["tile_k", "sp", [-1, 12]]]}, "result": [[0.016466704999999998, 0.0164685995, 0.0164699646], 0, 2.3179266452789307, 1649814497.3615065], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], 
"float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 65237833, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 8, 1]], ["tile_y", "sp", [-1, 16, 8, 1]], ["tile_k", "sp", [-1, 6, 4]]]}, "result": [[0.0004378561058495821, 0.0004378913426183844, 0.0004379270584958217], 0, 6.199182510375977, 1649879456.1792707], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 10265, "code_hash": null, "entity": [["block_row_warps", "ot", 4], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 1], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 16]]}, "result": [[0.00011448282788944724, 0.00011478408417085428, 0.00011491263505025125], 0, 1.6557848453521729, 1649880882.8527882], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.00506267196875, 0.005084877875, 0.005099014031250001], 0, 1.4695072174072266, 1649881655.5889988], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_tensorcore.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 10210, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 1], ["warp_col_tiles", "ot", 1], ["chunk", "ot", 4], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", 
"ot", 8], ["wmma_m", "ot", 16]]}, "result": [[1.617625803613724e-05, 1.6183566759152736e-05, 1.621106395073425e-05], 0, 1.9195764064788818, 1649882086.06151], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 11588, "code_hash": null, "entity": [["tile_y", "sp", [-1, 4, 8]], ["tile_x", "sp", [-1, 32, 1]], ["tile_k", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 64], ["unroll_explicit", "ot", 0]]}, "result": [[2.080483443708609e-05, 2.0816280353200882e-05, 2.083283294960348e-05], 0, 1.7147831916809082, 1649883281.0301726], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_tensorcore.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 3060, "code_hash": null, "entity": [["block_row_warps", "ot", 1], ["block_col_warps", "ot", 1], ["warp_row_tiles", "ot", 2], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 0], ["vec", "ot", 4], ["wmma_m", "ot", 32]]}, "result": [[1.4489859942609439e-05, 1.4505717687282495e-05, 1.4542667378960864e-05], 0, 2.024470806121826, 1649884257.5815887], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 10118, "code_hash": null, "entity": [["tile_y", "sp", [-1, 4, 8]], ["tile_x", "sp", [-1, 32, 1]], ["tile_k", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 32], ["unroll_explicit", "ot", 0]]}, "result": [[2.0256624645161293e-05, 2.0411544258064515e-05, 
2.0471600903225806e-05], 0, 1.4079077243804932, 1649885460.5937498], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 44926587, "code_hash": null, "entity": [["tile_x", "sp", [-1, 16, 2, 1]], ["tile_y", "sp", [-1, 12, 2, 1]], ["tile_k", "sp", [-1, 1, 8]]]}, "result": [[0.004547732660377358, 0.004549542320754717, 0.0045726865471698115], 0, 3.755682945251465, 1649890840.317121], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_tensorcore.cuda", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 10192, "code_hash": null, "entity": [["block_row_warps", "ot", 2], ["block_col_warps", "ot", 2], ["warp_row_tiles", "ot", 2], ["warp_col_tiles", "ot", 4], ["chunk", "ot", 2], ["offset", "ot", 8], ["offsetCS", "ot", 8], ["vec", "ot", 8], ["wmma_m", "ot", 16]]}, "result": [[0.000155265987607245, 0.00015604860152526216, 0.0001561262850333651], 0, 1.7700400352478027, 1649891493.4140472], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.0100141173, 0.01003659415, 0.0100415897], 0, 1.8085365295410156, 1649892333.886837], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": 
{"index": 34318245, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[5.8621e-06, 5.8625999999999996e-06, 5.8631e-06], 0, 0.5667502880096436, 1649968594.6997252], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15657147, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.3555999999999995e-06, 7.35565e-06, 7.3781000000000006e-06], 0, 0.7575399875640869, 1650319502.366343], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 6072920, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 56, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.77155e-06, 5.7885e-06, 5.81905e-06], 0, 1.1465270519256592, 1650323152.8143198], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 
"float32"], {}], "config": {"index": 9785718, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 2, 8]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[4.79855e-06, 4.81355e-06, 4.81455e-06], 0, 0.8575699329376221, 1650325997.844903], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 63569063, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.2961600000000001e-05, 1.2997649999999999e-05, 1.3000100000000001e-05], 0, 0.71053147315979, 1650330982.632312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 96, 112, 112], "float32"], ["TENSOR", [96, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 6588288, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 2, 4]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[1.655015e-05, 1.655815e-05, 1.66232e-05], 0, 2.27034854888916, 1650334575.9977324], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 
1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15104777, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 4, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.8786000000000005e-06, 6.899099999999999e-06, 6.915549999999999e-06], 0, 1.0633580684661865, 1650336756.828295], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 99179720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 6, 4]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[6.287049999999999e-06, 6.2876e-06, 6.31505e-06], 0, 2.9279582500457764, 1650340980.0825403], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 12539100, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 7]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[6.821099999999999e-06, 6.8255999999999995e-06, 6.850050000000001e-06], 0, 1.4625983238220215, 1650347726.6219482], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 
144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 18208026, "code_hash": null, "entity": [["tile_f", "sp", [-1, 3, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.36655e-06, 8.3721e-06, 8.377600000000001e-06], 0, 0.9377567768096924, 1650350453.9616745], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2479050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[4.43905e-06, 4.45505e-06, 4.46055e-06], 0, 1.6908175945281982, 1650355541.390374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 7217873, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.71105e-06, 5.7370499999999996e-06, 5.7421e-06], 0, 2.532322883605957, 1650358471.728223], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", 
[["TENSOR", [1, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9368144, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[4.228049999999999e-06, 4.2320499999999996e-06, 4.25555e-06], 0, 0.9063670635223389, 1650363448.7873058], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2508576, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[3.8405e-06, 3.84355e-06, 3.84605e-06], 0, 0.98264479637146, 1650366846.8783011], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 6769873, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[7.15055e-06, 7.1561e-06, 7.1566e-06], 0, 2.941910743713379, 1650369652.7351506], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 4656979, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[9.7176e-06, 9.7226e-06, 9.7241e-06], 0, 0.7810451984405518, 1650373123.1804664], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 28, 28], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 13558142, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[9.5361e-06, 9.554600000000002e-06, 9.5571e-06], 0, 1.1758761405944824, 1650377186.8371902], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 3415200, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 7]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.35005e-06, 5.37305e-06, 5.377549999999999e-06], 0, 1.0839922428131104, 1650380285.1866374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 
-thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 5194603, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.72372e-05, 1.727575e-05, 1.72932e-05], 0, 0.6490328311920166, 1650382649.6077251], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 702720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.667600000000001e-06, 4.67305e-06, 4.70105e-06], 0, 0.8721504211425781, 1650384995.9365287], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [128, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 13507251, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.216615e-05, 1.21901e-05, 1.2196650000000001e-05], 0, 0.9639995098114014, 1650392782.272614], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 4, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 50062362, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 2, 1, 2]], ["tile_y", "sp", [-1, 1, 2, 4]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[7.382600000000001e-06, 7.39615e-06, 7.4001e-06], 0, 2.2717225551605225, 1650424944.499887], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 45700339, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.95423e-05, 3.9552349999999994e-05, 3.9561300000000004e-05], 0, 2.2523245811462402, 1650430175.9074345], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 18573740, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", 
"ot", 0]]}, "result": [[2.01327e-05, 2.0137700000000002e-05, 2.01432e-05], 0, 1.0902690887451172, 1650436320.534228], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9657656, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.146595e-05, 4.152795e-05, 4.1557949999999994e-05], 0, 0.8977162837982178, 1650441321.9324644], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 29864625, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 2, 2]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[7.57113e-05, 7.581275e-05, 7.593024999999999e-05], 0, 0.7584812641143799, 1650447316.5830467], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 33342114, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 1, 8]], ["tile_y", "sp", [-1, 1, 4, 1]], 
["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[9.6796e-06, 9.6861e-06, 9.722649999999999e-06], 0, 2.743138313293457, 1650451842.4668179], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 17125050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.177395e-05, 4.179295e-05, 4.1814449999999994e-05], 0, 1.3813395500183105, 1650456647.9994063], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 7532887, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.2836050000000004e-05, 5.2853600000000005e-05, 5.28761e-05], 0, 0.8911774158477783, 1650459586.3381498], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 
"float32"], {}], "config": {"index": 14151622, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.994355e-05, 5.005455e-05, 5.0164050000000006e-05], 0, 1.1229846477508545, 1650462059.7162044], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 33536454, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[7.202550000000001e-06, 7.2061e-06, 7.2231e-06], 0, 2.9223580360412598, 1650465984.0746238], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 15365101, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[7.897534999999999e-05, 7.900685e-05, 7.902235e-05], 0, 1.4378130435943604, 1650471801.2672544], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda 
-keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 5252700, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[9.1631e-06, 9.168100000000001e-06, 9.189600000000001e-06], 0, 2.878096342086792, 1650475376.2008486], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3200427, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[5.586205e-05, 5.58681e-05, 5.60791e-05], 0, 1.1162426471710205, 1650478289.7313669], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3185277, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], 
["unroll_explicit", "ot", 1]]}, "result": [[6.979539999999999e-05, 7.000990000000001e-05, 7.007095e-05], 0, 1.088874340057373, 1650503541.7996006], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 208633, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[6.765845e-05, 6.768145000000001e-05, 6.770450000000001e-05], 0, 0.7414331436157227, 1650505644.5295877], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 6381558, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[8.845050000000001e-06, 8.855549999999999e-06, 8.88055e-06], 0, 3.725031852722168, 1650509494.6496606], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3492656, "code_hash": null, 
"entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[0.00010417354999999999, 0.00010418149999999999, 0.00010423009999999998], 0, 1.3148417472839355, 1650512952.687562], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 761250, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[1.1431599999999999e-05, 1.143465e-05, 1.1438650000000001e-05], 0, 5.0806190967559814, 1650515567.1315808], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 346926, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.6393e-05, 8.645245000000001e-05, 8.649599999999999e-05], 0, 1.271730661392212, 1650518280.522468], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 345844, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[0.00013144805, 0.0001315021, 0.00013154105], 0, 1.8906102180480957, 1650520029.3195784], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [1024, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 85322, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[0.0001166115, 0.00011664449999999998, 0.00011665295], 0, 0.9735369682312012, 1650521383.2568395], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float32"], {}], "config": {"index": 1503126, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 
1]]}, "result": [[1.08582e-05, 1.085915e-05, 1.085915e-05], 0, 3.2422332763671875, 1650522436.543362], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 2276787, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.7111e-06, 8.716649999999999e-06, 8.727600000000001e-06], 0, 0.9531784057617188, 1650594246.792376], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 14467404, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 2, 7, 1]], ["tile_x", "sp", [-1, 1, 2, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.0403099999999999e-05, 1.040915e-05, 1.04111e-05], 0, 3.6800646781921387, 1650596813.592452], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 1013880, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 
256], ["unroll_explicit", "ot", 1]]}, "result": [[3.897049999999999e-06, 3.91e-06, 3.92105e-06], 0, 1.0108630657196045, 1650599520.8898165], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 3021363, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.4886650000000001e-05, 1.4891699999999998e-05, 1.489365e-05], 0, 1.296245813369751, 1650602852.9900029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 75600, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.5125499999999995e-06, 3.5165e-06, 3.52305e-06], 0, 0.7898616790771484, 1650604889.7693179], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 198967, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 
1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.187615e-05, 1.188065e-05, 1.189265e-05], 0, 0.9806699752807617, 1650607006.9707348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 723143, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.1326e-06, 8.1471e-06, 8.1586e-06], 0, 1.4358642101287842, 1650608992.9624553], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [960, 1, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 78004, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[3.2270500000000003e-06, 3.2275e-06, 3.2370500000000006e-06], 0, 0.7736132144927979, 1650610364.983347], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 245551, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", 
"sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.996525e-05, 1.997425e-05, 2.00027e-05], 0, 1.1400103569030762, 1650611207.975019], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 385083, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.39933e-05, 2.4011299999999998e-05, 2.40183e-05], 0, 0.6805839538574219, 1650613344.2010403], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 418478, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 80]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.70027e-05, 1.70537e-05, 1.70547e-05], 0, 1.3821897506713867, 1650615147.8768744], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1280, 1, 1], "float32"], ["TENSOR", [1000, 1280, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 33248, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", 
[-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 80]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[2.96109e-05, 2.964935e-05, 2.96514e-05], 0, 1.2670972347259521, 1650616370.1425672], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"], {}], "config": {"index": 76584961, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 7]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.018785e-05, 3.021085e-05, 3.0319899999999997e-05], 0, 2.8301658630371094, 1650641335.9067605], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 9455056, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.2506e-06, 8.2611e-06, 8.262600000000001e-06], 0, 0.9980921745300293, 1650646562.0615616], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 152951, "code_hash": 
null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 4, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.538185e-05, 2.54259e-05, 2.5454850000000002e-05], 0, 1.7803680896759033, 1650673519.9665477], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 87933763, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.642805e-05, 3.644205e-05, 3.6453999999999994e-05], 0, 0.7112207412719727, 1650676122.1516056], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 26113940, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.661535e-05, 2.6678849999999996e-05, 2.6717849999999997e-05], 0, 0.46747660636901855, 1650677634.7384923], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], 
[1, 1], "float32"], {}], "config": {"index": 17407042, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.10854e-05, 7.109085e-05, 7.11029e-05], 0, 0.7810003757476807, 1650680015.5899546], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 12549050, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[2.24233e-05, 2.2450800000000002e-05, 2.24848e-05], 0, 0.708620548248291, 1650681553.685307], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 4732823, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.2699899999999995e-05, 3.27019e-05, 3.2721899999999996e-05], 0, 0.47304797172546387, 1650683034.4890563], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"conv2d_nchw_winograd.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 545975, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 4, 8]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.8336850000000006e-05, 2.8396349999999998e-05, 2.839935e-05], 0, 1.9903209209442139, 1650684248.641913], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 35838982, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.2699049999999996e-05, 4.2715550000000005e-05, 4.272905e-05], 0, 0.7070727348327637, 1650686601.5726962], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 2812995, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 1, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[0.00010861985, 0.00010863435, 0.0001086479], 0, 0.7317159175872803, 1650688514.756369], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 1574517, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.8229850000000002e-05, 2.8233350000000003e-05, 2.825485e-05], 0, 0.5290811061859131, 1650689649.0227518], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 2015046, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[4.65231e-05, 4.652755e-05, 4.65556e-05], 0, 0.5142829418182373, 1650690398.0774868], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 83259, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 4]], ["tile_x", "sp", [-1, 7, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.3635449999999997e-05, 3.365495e-05, 3.366745e-05], 0, 1.430091381072998, 
1650691249.2586534], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 3569676, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.170045000000001e-05, 7.171045000000001e-05, 7.17234e-05], 0, 0.7033722400665283, 1650692260.7750976], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 82748, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[0.00019157085, 0.0001915749, 0.00019157595], 0, 0.5982248783111572, 1650693315.4548712], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 319626, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 7]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], 
["unroll_explicit", "ot", 1]]}, "result": [[4.5206150000000004e-05, 4.5243650000000005e-05, 4.538615e-05], 0, 0.6440334320068359, 1650694063.325511], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], "float32"], {}], "config": {"index": 227061, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.21551e-05, 8.217515e-05, 8.223510000000001e-05], 0, 0.5891335010528564, 1650694683.6785614], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 190201, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 4]], ["tile_x", "sp", [-1, 1, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.184079999999999e-05, 5.184079999999999e-05, 5.1939300000000006e-05], 0, 0.6536734104156494, 1650695338.214328], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 791579, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 7]], ["tile_rc", "sp", [-1, 16]], 
["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.482695e-05, 9.48854e-05, 9.493345e-05], 0, 1.4417965412139893, 1650697059.9097984], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[2.1966849999999998e-05, 2.197135e-05, 2.19843e-05], 0, 0.4426250457763672, 1650697491.6213834], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [128, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 13507399, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.0136100000000001e-05, 1.0145149999999999e-05, 1.0159099999999999e-05], 0, 0.7864212989807129, 1650724653.7135007], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 22158174, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 1, 4]], ["tile_y", "sp", [-1, 1, 8, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], 
["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 0]]}, "result": [[6.9551e-06, 6.962599999999998e-06, 6.9646e-06], 0, 0.7211275100708008, 1650726163.0400467], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [256, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 20804619, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.81296e-05, 2.8130599999999998e-05, 2.814765e-05], 0, 0.5774815082550049, 1650727742.5586843], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [256, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 18573740, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.541225e-05, 1.54172e-05, 1.54537e-05], 0, 0.8937528133392334, 1650729498.3622022], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [128, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 17433624, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 
1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.12759e-05, 3.13064e-05, 3.13069e-05], 0, 0.7797591686248779, 1650731061.2478383], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 13412619, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.96386e-05, 4.970209999999999e-05, 4.970365e-05], 0, 0.6517493724822998, 1650732352.357867], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 36224490, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[8.519599999999999e-06, 8.5411e-06, 8.5516e-06], 0, 0.9091670513153076, 1650733940.9465184], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [512, 256, 
1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 4481521, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[3.21494e-05, 3.21714e-05, 3.218245e-05], 0, 0.627678394317627, 1650735124.7769744], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 14581630, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[4.980815e-05, 4.9814650000000006e-05, 4.9839099999999995e-05], 0, 0.9194021224975586, 1650736745.8550131], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [256, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 14448421, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.917355e-05, 3.918705e-05, 3.920855e-05], 0, 0.6747403144836426, 1650738065.3893676], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 
-thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 26828436, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 4, 2, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0], ["fuse_yx", "ot", 1]]}, "result": [[6.2195500000000004e-06, 6.2225999999999995e-06, 6.2360999999999995e-06], 0, 0.9589457511901855, 1650740107.0941224], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 8353521, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.006074999999999e-05, 6.0142750000000006e-05, 6.016975e-05], 0, 0.6222403049468994, 1650741436.1006827], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 11704914, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", 
"sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[9.4151e-06, 9.4271e-06, 9.44265e-06], 0, 1.0609674453735352, 1650742517.3273706], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1032786, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.349615e-05, 4.3545600000000005e-05, 4.35581e-05], 0, 0.6170883178710938, 1650743933.698596], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3334053, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[6.675835e-05, 6.676189999999999e-05, 6.67844e-05], 0, 0.8200364112854004, 1650745036.010395], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [512, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1560709, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", 
"sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.721785000000001e-05, 5.723285e-05, 5.7259849999999994e-05], 0, 0.7093315124511719, 1650746357.0012312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 2887260, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0], ["fuse_yx", "ot", 0]]}, "result": [[8.0721e-06, 8.0771e-06, 8.077649999999999e-06], 0, 1.0441720485687256, 1650747537.4062088], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1077100, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[8.750664999999999e-05, 8.752015e-05, 8.752465e-05], 0, 0.6045539379119873, 1650749113.1365385], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 
1024, 14, 14], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 1530018, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[1.173565e-05, 1.1736649999999999e-05, 1.174615e-05], 0, 1.378283977508545, 1650750073.250046], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 91389, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[6.80499e-05, 6.80609e-05, 6.810035e-05], 0, 0.781482458114624, 1650751244.5316045], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 353303, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.634765e-05, 9.63937e-05, 9.644220000000001e-05], 0, 1.0039925575256348, 
1650752141.1724086], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [1024, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 305015, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[9.278935e-05, 9.279235e-05, 9.27989e-05], 0, 1.5176756381988525, 1650753010.2550077], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "group_conv2d_nchw.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "float16"], {}], "config": {"index": 1514322, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_g", "sp", [-1, 1]], ["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1], ["fuse_yx", "ot", 1]]}, "result": [[1.128565e-05, 1.129965e-05, 1.130615e-05], 0, 2.028611183166504, 1650754020.2265894], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [32, 3, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 16795845, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 3]], 
["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[6.9991000000000005e-06, 7.01165e-06, 7.02965e-06], 0, 0.6407957077026367, 1650813939.149559], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 17044204, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.348049999999999e-06, 6.3581e-06, 6.3720999999999995e-06], 0, 0.5244479179382324, 1650816895.9108312], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 4975320, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 56, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.4956e-06, 5.5181e-06, 5.5605500000000005e-06], 0, 0.6335549354553223, 1650817497.488131], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [16, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 22304829, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 2, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", 
[-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.6231e-06, 4.635050000000001e-06, 4.64005e-06], 0, 1.0218045711517334, 1650818669.1302266], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 16, 112, 112], "float16"], ["TENSOR", [96, 16, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 53847898, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 2]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.173100000000001e-06, 7.176100000000001e-06, 7.176100000000001e-06], 0, 0.8336286544799805, 1650819872.646985], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 96, 112, 112], "float16"], ["TENSOR", [96, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 6922048, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 1]], ["tile_x", "sp", [-1, 1, 8, 7]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[6.5691e-06, 6.575050000000001e-06, 6.577099999999999e-06], 0, 0.763239860534668, 1650821086.6168768], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 56, 56], "float16"], ["TENSOR", [24, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 15206426, "code_hash": null, "entity": [["tile_f", "sp", [-1, 3, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], 
["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.61605e-06, 5.61705e-06, 5.6181000000000005e-06], 0, 0.7702598571777344, 1650822139.752306], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 24, 56, 56], "float16"], ["TENSOR", [144, 24, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 52846557, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.506550000000001e-06, 5.50705e-06, 5.5156e-06], 0, 0.8066568374633789, 1650823884.4936872], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [144, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 10299450, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 8]], ["tile_x", "sp", [-1, 1, 28, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[6.08305e-06, 6.0940500000000005e-06, 6.1166e-06], 0, 0.6963748931884766, 1650825481.4431868], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [24, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 34561186, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 6, 4]], 
["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 4, 2]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[7.8756e-06, 7.8781e-06, 7.8936e-06], 0, 0.600581169128418, 1650826356.6420047], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [144, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3175550, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.35055e-06, 4.3510500000000005e-06, 4.37055e-06], 0, 1.0711958408355713, 1650827886.3514745], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 144, 28, 28], "float16"], ["TENSOR", [32, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 7217856, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 2, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[5.16955e-06, 5.1786e-06, 5.1786e-06], 0, 1.1553356647491455, 1650828704.5764093], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 32, 28, 28], "float16"], ["TENSOR", [192, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3039664, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 8, 4]], 
["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[3.95305e-06, 3.95305e-06, 3.96305e-06], 0, 0.5106737613677979, 1650829874.8851082], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [192, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3046176, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.69705e-06, 3.7045500000000004e-06, 3.71005e-06], 0, 0.5545451641082764, 1650830991.8584304], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [32, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3283875, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.763050000000001e-06, 5.76505e-06, 5.7651e-06], 0, 0.8265197277069092, 1650831715.1286488], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [64, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 4522579, "code_hash": null, "entity": [["tile_f", 
"sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[8.88765e-06, 8.8976e-06, 8.9001e-06], 0, 0.7089235782623291, 1650833027.5999374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 28, 28], "float16"], ["TENSOR", [384, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 15094839, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 4]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.6831e-06, 7.7046e-06, 7.7066e-06], 0, 1.0405845642089844, 1650834890.3047237], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [384, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 3585120, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 4]], ["tile_x", "sp", [-1, 1, 14, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[5.17855e-06, 5.18105e-06, 5.2031e-06], 0, 0.6448190212249756, 1650836046.3030014], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [64, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 12193990, "code_hash": null, "entity": [["tile_f", "sp", 
[-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.315215e-05, 1.315965e-05, 1.31667e-05], 0, 1.0322420597076416, 1650837129.5166345], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [384, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 702720, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[4.46905e-06, 4.47055e-06, 4.472050000000001e-06], 0, 0.5480890274047852, 1650837907.944853], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 384, 14, 14], "float16"], ["TENSOR", [96, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 2391475, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.8601e-06, 7.8606e-06, 7.867599999999999e-06], 0, 0.8951401710510254, 1650839601.1209044], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 96, 14, 14], "float16"], ["TENSOR", [576, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3808975, "code_hash": null, "entity": 
[["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[7.5676e-06, 7.5796e-06, 7.5806e-06], 0, 0.5769634246826172, 1650841183.037211], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [576, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1017240, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}, "result": [[3.3840500000000002e-06, 3.3995499999999995e-06, 3.4110500000000006e-06], 0, 0.5795722007751465, 1650842147.0042777], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [96, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 3079603, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.072065e-05, 1.0726149999999999e-05, 1.073915e-05], 0, 1.0619502067565918, 1650843026.3948507], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [576, 1, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": 
{"index": 48759, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[3.3625500000000005e-06, 3.37555e-06, 3.3775000000000003e-06], 0, 0.4328014850616455, 1650843813.8582454], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 576, 7, 7], "float16"], ["TENSOR", [160, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 412645, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.0541650000000001e-05, 1.056665e-05, 1.061315e-05], 0, 2.12727427482605, 1650844230.8154716], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 160, 7, 7], "float16"], ["TENSOR", [960, 160, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 658631, "code_hash": null, "entity": [["tile_f", "sp", [-1, 6, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[6.8161e-06, 6.823100000000001e-06, 6.841600000000001e-06], 0, 1.1339399814605713, 1650844824.3560324], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "depthwise_conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [960, 1, 3, 3], "float16"], [1, 1], [1, 1, 1, 
1], [1, 1], "float16"], {}], "config": {"index": 34996, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 0]]}, "result": [[3.2030500000000006e-06, 3.20655e-06, 3.22755e-06], 0, 0.46258974075317383, 1650845433.5094543], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [160, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 256311, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.67563e-05, 1.6783300000000002e-05, 1.67918e-05], 0, 1.3270132541656494, 1650846386.3554056], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [320, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 535611, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}, "result": [[2.13803e-05, 2.138485e-05, 2.14313e-05], 0, 0.6166877746582031, 1650847120.3649514], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 320, 7, 7], "float16"], ["TENSOR", [1280, 320, 1, 1], "float16"], [1, 1], [0, 0, 
0, 0], [1, 1], "float16"], {}], "config": {"index": 872914, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 160]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.4967699999999999e-05, 1.4970200000000001e-05, 1.497575e-05], 0, 3.8850479125976562, 1650848115.3614283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1280, 1, 1], "float16"], ["TENSOR", [1000, 1280, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 18036, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 5, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.67544e-05, 2.68309e-05, 2.6850400000000003e-05], 0, 0.5703048706054688, 1650848714.866295], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 7797746, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 2, 1]], ["tile_y", "sp", [-1, 2, 2, 9]], ["tile_k", "sp", [-1, 8, 1]]]}, "result": [[0.0038837555500000004, 0.00388391505, 0.0038845200499999996], 0, 2.5623362064361572, 1650922517.9653258], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, 
"entity": [["tile_k", "sp", [-1, 8]]]}, "result": [[0.026828510899999998, 0.026837540599999998, 0.0268397599], 0, 3.169992446899414, 1650922648.847927], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 64], "float32"], ["TENSOR", [600, 32, 64], "float32"], [600, 32, 32], "float32", 0, 1], {}], "config": {"index": 4951, "code_hash": null, "entity": [["tile_y", "sp", [-1, 2, 8]], ["tile_x", "sp", [-1, 16, 1]], ["tile_k", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 16], ["unroll_explicit", "ot", 0]]}, "result": [[3.28759e-05, 3.29589e-05, 3.2961349999999996e-05], 0, 0.5255897045135498, 1650922854.3883076], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "batch_matmul.cuda", [["TENSOR", [600, 32, 32], "float32"], ["TENSOR", [600, 64, 32], "float32"], [600, 32, 64], "float32", 0, 1], {}], "config": {"index": 27148, "code_hash": null, "entity": [["tile_y", "sp", [-1, 2, 8]], ["tile_x", "sp", [-1, 16, 1]], ["tile_k", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 64], ["unroll_explicit", "ot", 1]]}, "result": [[3.2340449999999996e-05, 3.23409e-05, 3.23504e-05], 0, 0.5354282855987549, 1650923290.7878547], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 4812433, "code_hash": null, "entity": [["tile_x", "sp", [-1, 10, 8, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_k", "sp", [-1, 24, 1]]]}, "result": [[0.00024860935, 0.00024866190000000005, 0.00024869645], 0, 1.9623265266418457, 1650924106.1380405], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", 
"dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 9, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.00217030065, 0.0021703712, 0.00217064015], 0, 0.5637376308441162, 1650924967.713869], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 44601969, "code_hash": null, "entity": [["tile_x", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 2, 4, 4]], ["tile_k", "sp", [-1, 4, 4]]]}, "result": [[0.0018861851000000003, 0.00188620165, 0.0018940356999999999], 0, 1.277604103088379, 1650927453.3620265], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 8]]]}, "result": [[0.03605818615, 0.0362008626, 0.0362174195], 0, 4.180805444717407, 1650928571.656076], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_large_batch.gpu", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], "float32"], null, "float32"], {}], "config": {"index": 15182559, "code_hash": null, "entity": [["tile_x", "sp", [-1, 2, 2, 5]], ["tile_y", "sp", [-1, 4, 2, 3]], ["tile_k", "sp", [-1, 8, 2]]]}, "result": [[0.00559109345, 0.00559257735, 0.0056176812], 0, 3.4281482696533203, 1650932142.7651248], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], 
"float32"], null, "float32"], {}], "config": {"index": 11, "code_hash": null, "entity": [["tile_k", "sp", [-1, 64]]]}, "result": [[0.0361240734, 0.036126658900000004, 0.03614633765], 0, 4.267355918884277, 1650932378.2820215], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [64, 3, 7, 7], "float16"], [2, 2], [3, 3, 3, 3], [1, 1], "float16"], {}], "config": {"index": 76554127, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 2, 4]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 7]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.347395e-05, 3.347595e-05, 3.352245e-05], 0, 2.189602851867676, 1650936160.5027037], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 9992684, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[7.1521e-06, 7.1581e-06, 7.1616e-06], 0, 0.80893874168396, 1650937165.8348072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 346643, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", 
[-1, 2, 4, 4]], ["tile_x", "sp", [-1, 7, 14, 1]], ["tile_rc", "sp", [-1, 64]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[1.979975e-05, 1.9807250000000003e-05, 1.98152e-05], 0, 4.116674423217773, 1650937833.3360083], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 87994222, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[3.28174e-05, 3.28324e-05, 3.283545e-05], 0, 0.8266921043395996, 1650939008.9283469], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [64, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 11665964, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 4]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[2.11318e-05, 2.11398e-05, 2.114625e-05], 0, 0.6964631080627441, 1650940826.7842171], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 29503014, "code_hash": null, "entity": [["tile_f", "sp", 
[-1, 1, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[5.76747e-05, 5.769175e-05, 5.772775e-05], 0, 0.8780744075775146, 1650942896.389656], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [512, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 6593422, "code_hash": null, "entity": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 7, 4]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[1.71372e-05, 1.71937e-05, 1.723925e-05], 0, 0.6540956497192383, 1650944631.3486328], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [128, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 10686988, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.57673e-05, 2.577485e-05, 2.578635e-05], 0, 1.1292264461517334, 1650946431.3661022], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 
"float16"], {}], "config": {"index": 474214, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 1, 49, 2]], ["tile_rc", "sp", [-1, 128]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}, "result": [[2.18753e-05, 2.187975e-05, 2.18833e-05], 0, 1.2164885997772217, 1650947759.1606152], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 8384098, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 4]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[4.11295e-05, 4.113955e-05, 4.1163549999999997e-05], 0, 1.0902330875396729, 1650949638.5774388], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 7353795, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 32, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[8.86406e-05, 8.864109999999999e-05, 8.864115e-05], 0, 0.8693947792053223, 1650950775.1563373], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [1024, 256, 1, 
1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 930783, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 4]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}, "result": [[2.34788e-05, 2.34858e-05, 2.35078e-05], 0, 0.5252759456634521, 1650951747.1516812], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [256, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 1212775, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 64]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.91651e-05, 3.9180599999999995e-05, 3.9193099999999996e-05], 0, 0.6574358940124512, 1650953175.4410644], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 86722, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 8, 4, 4]], ["tile_x", "sp", [-1, 1, 49, 1]], ["tile_rc", "sp", [-1, 128]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[2.68669e-05, 2.6873399999999996e-05, 2.687835e-05], 0, 1.2940130233764648, 1650953832.3535905], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", 
[["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 4312500, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[5.6155250000000004e-05, 5.6155750000000005e-05, 5.616675e-05], 0, 0.949575662612915, 1650956448.7583783], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 42753, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[0.0001527741, 0.0001528176, 0.00015290065], 0, 0.6949319839477539, 1650957527.024363], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [2048, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 143805, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[3.205195e-05, 3.209495e-05, 3.21014e-05], 0, 0.7755589485168457, 1650958461.6669595], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [512, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], "float16"], {}], "config": {"index": 188833, "code_hash": null, "entity": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}, "result": [[6.173455e-05, 6.1736e-05, 6.174349999999999e-05], 0, 0.7472929954528809, 1650959393.5092852], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw_winograd.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 45226, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 1, 4, 4]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[3.226995e-05, 3.22995e-05, 3.236145e-05], 0, 0.8135907649993896, 1650960185.1974587], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "conv2d_nchw.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 789819, "code_hash": null, "entity": [["tile_f", "sp", [-1, 1, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[8.708445e-05, 8.710639999999999e-05, 8.711495e-05], 0, 2.0074827671051025, 1650961212.134662], "version": 
0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", "dense_small_batch.gpu", [["TENSOR", [1, 2048], "float16"], ["TENSOR", [1000, 2048], "float16"], null, "float16"], {}], "config": {"index": 5, "code_hash": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[6.6981e-06, 6.7081e-06, 6.7141e-06], 0, 0.44542431831359863, 1650961789.8975906], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[7.8907e-05, 7.89105e-05, 7.89305e-05], 0, 1.3035199642181396, 1651696037.0387743], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [1024, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.2306149999999994e-05, 5.232314999999999e-05, 5.233665e-05], 0, 0.6558830738067627, 1651696051.5444815], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[3.39419e-05, 3.395845e-05, 3.4114899999999994e-05], 0, 0.6455562114715576, 1651696060.9264348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[7.451495e-05, 7.6711e-05, 8.551005e-05], 0, 1.2933030128479004, 1651696077.6541371], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 32, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[8.338005000000001e-05, 8.339205e-05, 8.345205e-05], 0, 1.2021043300628662, 1651696086.517543], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.517355e-05, 4.5203050000000004e-05, 4.52056e-05], 0, 0.6302511692047119, 1651696097.985193], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[6.15488e-05, 6.161125e-05, 6.16368e-05], 0, 1.2534654140472412, 1651696115.4964283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.3825550000000006e-05, 4.3842049999999995e-05, 4.38481e-05], 0, 0.6925802230834961, 1651696129.3560424], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.49448e-05, 2.494735e-05, 2.4948799999999997e-05], 0, 0.7423253059387207, 1651696138.326287], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.060915e-05, 5.0639650000000006e-05, 5.0771650000000004e-05], 0, 1.2802436351776123, 1651696155.4096394], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 16, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[6.28783e-05, 6.29178e-05, 6.48748e-05], 0, 1.224653720855713, 1651696166.4411144], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[7.627445e-05, 7.629045e-05, 7.63595e-05], 0, 1.2499911785125732, 1651696178.3922029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 2, "code_hash": null, "entity": [["algo", "ot", 2]]}, "result": [[5.29612e-05, 5.3004150000000006e-05, 5.5102199999999996e-05], 0, 1.3471219539642334, 1651696192.9950073], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.69028e-05, 2.6947850000000003e-05, 2.709185e-05], 0, 0.6759750843048096, 1651696206.3298318], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.0580999999999997e-05, 4.0632550000000005e-05, 4.0673e-05], 0, 1.2982831001281738, 1651696215.4789267], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.3493549999999996e-05, 4.352155e-05, 4.361855e-05], 0, 1.3339207172393799, 1651696233.0065742], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 8, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.64837e-05, 5.6511250000000005e-05, 6.553234999999999e-05], 0, 1.2724413871765137, 1651696246.2444258], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.016415e-05, 5.0178150000000003e-05, 5.02056e-05], 0, 0.6765668392181396, 1651696257.173029], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.13285e-05, 4.135955e-05, 4.1496e-05], 0, 1.2668747901916504, 1651696265.374396], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.542705e-05, 4.566555e-05, 4.8272150000000005e-05], 0, 0.7064330577850342, 1651696278.1836157], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.19779e-05, 3.2020399999999996e-05, 3.22334e-05], 0, 1.2480614185333252, 1651696293.8619213], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 4, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.314195e-05, 3.315195e-05, 3.31679e-05], 0, 1.3792345523834229, 1651696312.5689838], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [128, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.605575e-05, 1.6090699999999998e-05, 1.61782e-05], 0, 1.2849547863006592, 1651696321.6512122], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 
-thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 3072], "float16"], ["TENSOR", [768, 3072], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00013500355, 0.0001350666, 0.00013514510000000001], 0, 0.6779699325561523, 1651704762.118352], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [3072, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.000130729, 0.000130799, 0.000130856], 0, 0.7103776931762695, 1651704764.6885316], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [768, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[4.252165e-05, 4.254115e-05, 4.254215e-05], 0, 0.5850663185119629, 1651704767.20349], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 32], "float16"], ["TENSOR", [600, 64, 32], "float16"], [600, 32, 64], "float16", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[1.93158e-05, 1.9405800000000003e-05, 1.9479299999999998e-05], 0, 0.5462775230407715, 1651704769.5398254], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 64], "float16"], ["TENSOR", [600, 32, 64], "float16"], [600, 32, 32], "float16", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": 
[[2.1815349999999998e-05, 2.1823350000000002e-05, 2.19333e-05], 0, 0.5473110675811768, 1651704771.9580688], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float16"], ["TENSOR", [2304, 768], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[9.769749999999999e-05, 9.77725e-05, 9.781149999999998e-05], 0, 0.6392307281494141, 1651704774.6099348], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1280, 1, 1], "float32"], ["TENSOR", [1000, 1280, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[1.585375e-05, 1.585375e-05, 1.586625e-05], 0, 0.6553614139556885, 1651705312.2156374], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.87343e-05, 1.8758849999999997e-05, 1.8765299999999998e-05], 0, 1.254763126373291, 1651705318.060327], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.7147599999999996e-05, 3.71591e-05, 3.71691e-05], 0, 
1.297356128692627, 1651705334.5834737], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.79461e-05, 3.79671e-05, 3.796715e-05], 0, 1.2885222434997559, 1651705348.3673804], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.5520249999999998e-05, 1.5562750000000003e-05, 1.644575e-05], 0, 1.3106482028961182, 1651705361.6600392], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.5274899999999997e-05, 2.5275900000000003e-05, 2.530395e-05], 0, 1.29083251953125, 1651705367.3725972], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.6715450000000002e-05, 2.6721899999999996e-05, 
2.6767950000000002e-05], 0, 1.2988712787628174, 1651705383.1567316], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.613175e-05, 1.6169249999999998e-05, 1.621125e-05], 0, 1.295907974243164, 1651705394.4066052], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.90103e-05, 1.90198e-05, 1.90208e-05], 0, 1.2901599407196045, 1651705407.7473567], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 28, 28], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.90728e-05, 1.90858e-05, 1.909985e-05], 0, 1.2148852348327637, 1651705422.5198638], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 28, 28], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.539325e-05, 1.539975e-05, 1.5983249999999996e-05], 0, 
1.2038941383361816, 1651705431.8003838], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.5386250000000002e-05, 1.540225e-05, 1.541325e-05], 0, 1.298431634902954, 1651705446.8170843], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.58127e-05, 1.6144749999999998e-05, 1.63108e-05], 0, 1.2637646198272705, 1651705455.472888], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.4976249999999999e-05, 1.7667749999999997e-05, 1.774875e-05], 0, 1.1796517372131348, 1651705468.7749956], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.532325e-05, 1.5424749999999998e-05, 1.567525e-05], 0, 
1.286487340927124, 1651705487.1521738], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.51377e-05, 1.5184749999999999e-05, 1.5248749999999999e-05], 0, 1.1545443534851074, 1651705491.9636042], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.583025e-05, 1.583175e-05, 1.641125e-05], 0, 1.2789053916931152, 1651705511.8756096], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.596475e-05, 1.63408e-05, 1.766875e-05], 0, 1.2472562789916992, 1651705522.051849], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.4875400000000002e-05, 2.4914400000000003e-05, 2.49284e-05], 0, 1.2587049007415771, 
1651705532.8126292], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.580675e-05, 1.5856300000000003e-05, 1.5899250000000002e-05], 0, 1.2889397144317627, 1651705545.7501018], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 32, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.533775e-05, 1.536725e-05, 1.536775e-05], 0, 1.2521677017211914, 1651705557.5611386], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.612825e-05, 1.6157300000000002e-05, 1.6178749999999998e-05], 0, 1.2984960079193115, 1651705570.6978276], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.8641849999999995e-05, 5.8685849999999996e-05, 5.9010850000000004e-05], 0, 
1.2714319229125977, 1651713264.5110943], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [1024, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.15054e-05, 6.163035e-05, 6.22079e-05], 0, 1.270298957824707, 1651713270.2057958], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 7, 7], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.843080000000001e-05, 5.888885e-05, 5.890435000000001e-05], 0, 1.3643712997436523, 1651713286.55753], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [2048, 1024, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.20044e-05, 6.215889999999999e-05, 6.27134e-05], 0, 1.3452882766723633, 1651713299.5873055], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 32, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.113384999999999e-05, 6.12164e-05, 6.12939e-05], 0, 
1.3625106811523438, 1651713310.588895], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [1024, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[7.63861e-05, 7.63931e-05, 7.77211e-05], 0, 1.4055383205413818, 1651713325.2641764], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.80418e-05, 5.821035e-05, 5.842684999999999e-05], 0, 1.346651315689087, 1651713331.084954], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [512, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.944685e-05, 5.953585e-05, 5.9695400000000004e-05], 0, 1.2687809467315674, 1651713342.4434164], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.35162e-05, 6.37062e-05, 6.374014999999999e-05], 0, 1.2827024459838867, 
1651713358.4088492], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [1024, 512, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.186515e-05, 6.191315e-05, 6.222864999999999e-05], 0, 1.2956676483154297, 1651713367.8496935], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 16, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0032100000000004e-05, 6.0201099999999995e-05, 6.102315e-05], 0, 1.367894172668457, 1651713384.8863037], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.615235e-05, 4.6155350000000005e-05, 4.67844e-05], 0, 0.6502845287322998, 1651713389.0626698], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.905665e-05, 5.910555e-05, 5.9806099999999996e-05], 0, 1.3620550632476807, 
1651713401.7820523], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [256, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.297215e-05, 6.298165e-05, 6.303215e-05], 0, 1.2902483940124512, 1651713416.3568978], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.8349199999999995e-05, 3.8372200000000005e-05, 3.83862e-05], 0, 1.2480621337890625, 1651713429.365083], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [512, 256, 1, 1], "float16"], [2, 2], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.86542e-05, 3.866125e-05, 3.8687700000000006e-05], 0, 1.2490315437316895, 1651713438.7554371], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 8, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.891705e-05, 5.90376e-05, 5.916055e-05], 0, 1.2681810855865479, 1651713447.8981256], 
"version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [256, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.140065e-05, 6.14486e-05, 6.149865e-05], 0, 1.307438611984253, 1651713464.5051072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [128, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.749815e-05, 3.7540649999999994e-05, 3.75712e-05], 0, 1.3630273342132568, 1651713472.181648], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [256, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.620115e-05, 3.6212149999999995e-05, 3.628415e-05], 0, 1.2850840091705322, 1651713485.997229], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [256, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.2912399999999998e-05, 2.29144e-05, 2.2933899999999997e-05], 0, 1.3092646598815918, 1651713499.214019], "version": 0.2, 
"tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 32, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[5.240745e-05, 5.2427950000000003e-05, 5.24484e-05], 0, 1.3632240295410156, 1651713513.548847], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [128, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.67913e-05, 1.68143e-05, 1.68283e-05], 0, 1.330857515335083, 1651713525.1422098], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 3072], "float32"], ["TENSOR", [768, 3072], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.0006445004499999999, 0.0006610111999999999, 0.0006667967499999999], 0, 0.648043155670166, 1651716140.0694156], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [3072, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00062875765, 0.00063329275, 0.0006615721500000001], 0, 0.6620566844940186, 1651716142.9851758], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", 
"dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [768, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00016988825000000001, 0.00016991475, 0.00017001125], 0, 0.553156852722168, 1651716145.929202], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 32], "float32"], ["TENSOR", [600, 64, 32], "float32"], [600, 32, 64], "float32", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[8.121880000000001e-05, 8.12663e-05, 8.133385e-05], 0, 0.5643446445465088, 1651716148.5040443], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "batch_matmul_cublas.cuda", [["TENSOR", [600, 32, 64], "float32"], ["TENSOR", [600, 32, 64], "float32"], [600, 32, 32], "float32", 0, 1], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00012531600000000002, 0.0001253595, 0.00012557855000000002], 0, 0.5810286998748779, 1651716151.1036334], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1600, 768], "float32"], ["TENSOR", [2304, 768], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[0.00046191695, 0.00046221050000000007, 0.00047105154999999997], 0, 0.6363368034362793, 1651716154.1708283], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1280, 1, 1], "float16"], ["TENSOR", [1000, 1280, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, 
"code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.02168e-05, 5.048629999999999e-05, 5.07603e-05], 0, 1.243542194366455, 1651717284.5080416], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 320, 7, 7], "float16"], ["TENSOR", [1280, 320, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.0327099999999995e-05, 4.033165e-05, 4.037015e-05], 0, 1.2465286254882812, 1651717295.9882414], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [320, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.149594999999999e-05, 6.213245e-05, 6.32615e-05], 0, 1.318899154663086, 1651717309.334955], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 960, 7, 7], "float16"], ["TENSOR", [160, 960, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.716590000000001e-05, 5.74579e-05, 5.75394e-05], 0, 1.2854020595550537, 1651717317.5957215], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 160, 7, 7], "float16"], ["TENSOR", [960, 160, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": 
[["algo", "ot", 0]]}, "result": [[2.303085e-05, 2.303885e-05, 2.3046849999999998e-05], 0, 1.2467610836029053, 1651717327.262812], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 7, 7], "float16"], ["TENSOR", [160, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0953449999999995e-05, 6.199295e-05, 6.449e-05], 0, 1.27321195602417, 1651717339.923953], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 576, 14, 14], "float16"], ["TENSOR", [96, 576, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.847925e-05, 4.8582249999999994e-05, 4.8923249999999996e-05], 0, 0.7204091548919678, 1651717351.960737], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 14, 14], "float16"], ["TENSOR", [576, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.88083e-05, 1.883275e-05, 1.884675e-05], 0, 1.296351432800293, 1651717366.7528808], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 14, 14], "float16"], ["TENSOR", [96, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, 
"result": [[4.69267e-05, 4.69492e-05, 4.775025e-05], 0, 0.6626412868499756, 1651717373.8019524], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 384, 28, 28], "float16"], ["TENSOR", [64, 384, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.43037e-05, 4.433015e-05, 4.5084200000000005e-05], 0, 0.7547762393951416, 1651717383.6092317], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 28, 28], "float16"], ["TENSOR", [384, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.609725e-05, 1.614725e-05, 1.641175e-05], 0, 1.2399446964263916, 1651717399.7526286], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [64, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.708045e-05, 2.70849e-05, 2.7090900000000002e-05], 0, 1.2248289585113525, 1651717410.9461355], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 192, 28, 28], "float16"], ["TENSOR", [32, 192, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.6925900000000006e-05, 
2.6931899999999995e-05, 2.706495e-05], 0, 1.2500784397125244, 1651717417.8287508], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 28, 28], "float16"], ["TENSOR", [192, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.570075e-05, 1.5813749999999997e-05, 1.586775e-05], 0, 1.197462558746338, 1651717429.7097864], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 28, 28], "float16"], ["TENSOR", [32, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.344685e-05, 2.34489e-05, 2.347385e-05], 0, 1.221846342086792, 1651717441.3164573], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 144, 56, 56], "float16"], ["TENSOR", [24, 144, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.3502899999999998e-05, 2.350685e-05, 2.3518900000000003e-05], 0, 1.2411229610443115, 1651717456.703392], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 24, 56, 56], "float16"], ["TENSOR", [144, 24, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.656425e-05, 1.65673e-05, 
1.67688e-05], 0, 1.2433347702026367, 1651717463.5552585], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 96, 56, 56], "float16"], ["TENSOR", [24, 96, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.69633e-05, 1.6982800000000003e-05, 1.69848e-05], 0, 1.2298755645751953, 1651717475.358641], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 16, 112, 112], "float16"], ["TENSOR", [96, 16, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.81473e-05, 1.818825e-05, 1.8337800000000003e-05], 0, 1.2717070579528809, 1651717489.3928924], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [16, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.660675e-05, 1.6634750000000002e-05, 1.6637750000000002e-05], 0, 1.2657883167266846, 1651717502.1012578], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 32, 112, 112], "float16"], ["TENSOR", [32, 32, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.70378e-05, 1.70493e-05, 1.75908e-05], 0, 
1.2577564716339111, 1651717513.2569478], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [32, 3, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.683475e-05, 1.6837750000000003e-05, 1.6855800000000002e-05], 0, 1.2461504936218262, 1651717520.0673897], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1, 2048], "float16"], ["TENSOR", [1000, 2048], "float16"], null, "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[1.012365e-05, 1.013265e-05, 1.017665e-05], 0, 0.511929988861084, 1651718651.679756], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.54195e-05, 6.549299999999999e-05, 6.550705e-05], 0, 1.395721435546875, 1651718658.7949653], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], "float16"], ["TENSOR", [512, 2048, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.2396e-05, 6.26255e-05, 6.280845e-05], 0, 1.296645164489746, 1651718672.3619213], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": 
["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float16"], ["TENSOR", [2048, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.71654e-05, 5.731935e-05, 5.7772900000000004e-05], 0, 1.2215275764465332, 1651718687.327885], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.98051e-05, 6.982555e-05, 6.986205e-05], 0, 1.2769300937652588, 1651718697.2846546], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.818955e-05, 6.820205e-05, 6.87891e-05], 0, 1.224205493927002, 1651718706.8167992], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float16"], ["TENSOR", [256, 1024, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.9839299999999996e-05, 5.013474999999999e-05, 5.0225299999999995e-05], 0, 0.675896167755127, 1651718721.3677247], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda 
-keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float16"], ["TENSOR", [1024, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.600905e-05, 3.601655e-05, 3.60421e-05], 0, 1.211909532546997, 1651718733.218238], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float16"], ["TENSOR", [256, 256, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.584605e-05, 6.602705e-05, 6.646905e-05], 0, 1.2575101852416992, 1651718744.2928133], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.469235e-05, 5.492235e-05, 5.5110899999999994e-05], 0, 1.3174407482147217, 1651718761.5744207], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [128, 512, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[4.54077e-05, 4.565825e-05, 4.6260699999999996e-05], 0, 0.6943933963775635, 1651718771.8819072], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 
-libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float16"], ["TENSOR", [512, 128, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.460135e-05, 2.4602899999999998e-05, 2.461885e-05], 0, 1.2023015022277832, 1651718779.6079652], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float16"], ["TENSOR", [128, 128, 3, 3], "float16"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.0838449999999993e-05, 6.0851449999999995e-05, 6.165745e-05], 0, 1.2409472465515137, 1651718790.9801152], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float16"], ["TENSOR", [64, 256, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.40235e-05, 3.405705e-05, 3.4086e-05], 0, 1.2000832557678223, 1651718801.1824563], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[6.791055e-05, 6.794504999999999e-05, 6.79531e-05], 0, 1.2569653987884521, 1651718819.732785], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas 
-max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float16"], ["TENSOR", [64, 64, 1, 1], "float16"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.638225e-05, 1.63893e-05, 1.671225e-05], 0, 1.1855523586273193, 1651718824.89687], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float16"], ["TENSOR", [64, 3, 7, 7], "float16"], [2, 2], [3, 3, 3, 3], [1, 1], 1, "NCHW", "float16"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.066025e-05, 5.0691799999999995e-05, 5.092584999999999e-05], 0, 1.2404775619506836, 1651718841.5797617], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "dense_cublas.cuda", [["TENSOR", [1, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": []}, "result": [[2.261185e-05, 2.2639350000000002e-05, 2.264785e-05], 0, 0.5434033870697021, 1651719634.0237954], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[0.0001233614, 0.00012337235, 0.0001235209], 0, 1.3691678047180176, 1651719642.5484385], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 2048, 7, 7], 
"float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[5.8770900000000003e-05, 5.880735e-05, 5.896090000000001e-05], 0, 0.7814383506774902, 1651719660.6665301], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.3442850000000002e-05, 2.348735e-05, 2.349685e-05], 0, 0.7805249691009521, 1651719670.6545522], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[0.0001233889, 0.0001234364, 0.00012356584999999998], 0, 1.3443200588226318, 1651719689.3742628], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[6.372395e-05, 6.374645e-05, 6.375595e-05], 0, 1.365302562713623, 1651719706.674646], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 1024, 14, 14], "float32"], 
["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.4549200000000004e-05, 4.45562e-05, 4.457515e-05], 0, 1.327195644378662, 1651719715.0719266], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.234335e-05, 2.23598e-05, 2.237235e-05], 0, 0.6870465278625488, 1651719729.4540246], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[6.869755e-05, 6.872955e-05, 6.886254999999999e-05], 0, 1.3498952388763428, 1651719742.1431673], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[3.7592099999999995e-05, 3.768655e-05, 3.935910000000001e-05], 0, 1.39213228225708, 1651719760.1923416], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 
512, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[3.28385e-05, 3.28675e-05, 3.2889e-05], 0, 1.3503003120422363, 1651719778.4827368], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[2.4456849999999996e-05, 2.4474399999999997e-05, 2.4484349999999995e-05], 0, 1.3296136856079102, 1651719786.8804219], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[7.127059999999999e-05, 7.132754999999999e-05, 7.13411e-05], 0, 1.249570608139038, 1651719801.3369465], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 1, "code_hash": null, "entity": [["algo", "ot", 1]]}, "result": [[2.11993e-05, 2.130335e-05, 2.186885e-05], 0, 0.655217170715332, 1651719817.2530777], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], 
"float32"], [1, 1], [1, 1, 1, 1], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 6, "code_hash": null, "entity": [["algo", "ot", 6]]}, "result": [[3.667505e-05, 3.6721099999999996e-05, 3.738805e-05], 0, 1.327420711517334, 1651719833.0931804], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0, 0, 0], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[1.589875e-05, 1.591525e-05, 1.617775e-05], 0, 1.33211088180542, 1651719846.9002128], "version": 0.2, "tvm_version": "0.9.dev0"} +{"input": ["cuda -keys=cuda,gpu -arch=sm_86 -libs=cudnn,cublas -max_num_threads=1024 -thread_warp_size=32", "conv2d_cudnn.cuda", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], 1, "NCHW", "float32"], {}], "config": {"index": 0, "code_hash": null, "entity": [["algo", "ot", 0]]}, "result": [[4.56097e-05, 4.56587e-05, 4.57937e-05], 0, 1.3442072868347168, 1651719855.7311447], "version": 0.2, "tvm_version": "0.9.dev0"} diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index f73f2230df4d7..eebb2fb77c201 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -544,6 +544,24 @@ constexpr const char* kExternalMods = "external_mods"; */ constexpr const char* kConstNameToConstant = "const_name_to_constant"; +/*! + * \brief All the runtime::Modules accumulated during compilation by external codegen. These + * modules must be either directly linked or captured in the final compilation artifact. + * + * Type: Array + */ +constexpr const char* kExternalMods = "external_mods"; + +/*! + * \brief All the named runtime::NDArrays accumulated during compilation by external codegen. 
+ * Generally the associated runtime::Module will indicate it requires bindings for these names, + * and during module initialization these bindings will be recovered from a ConstLoaderModule. + * See also kConstantsArray above, which is the analog for PrimFuncs. + * + * Type: Map + */ +constexpr const char* kConstNameToNDArray = "const_name_to_ndarray"; + } // namespace attr } // namespace tvm #endif // TVM_IR_MODULE_H_ diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 11a608d4cbbf8..ee645976ec90c 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -58,6 +58,11 @@ class DispatchContext(object): def __init__(self): self._old_ctx = DispatchContext.current + # TODO(mbs): Collage only: Allow cache query + # DO NOT SUBMIT + def contains(self, target, workload): + raise NotImplementedError() + def query(self, target, workload): """ Query the context to get the specific config for a template. @@ -297,8 +302,10 @@ def load(self, records): counter = 0 for inp, res in joint_records: counter += 1 - if res.error_no != 0: - continue + # TODO(mbs): Collage only: Cache the error so don't re-tune + # DO NOT SUBMIT + # if res.error_no != 0: + # continue # use target keys in tvm target system as key to build best map for k in inp.target.keys: @@ -320,7 +327,16 @@ def load(self, records): if np.mean(other_res.costs) > np.mean(res.costs): best_by_model[key] = (inp, res) - logger.debug("Finish loading %d records", counter) + # TODO(mbs): Collage only: Too verbose + # DO NOT SUBMIT + # logger.info("Finished loading %d records", counter) + + # TODO(mbs): Collage only: Allow cache query + # DO NOT SUBMIT + def contains(self, target, workload): + # logger.info( + # f"look for match with {target} and {workload} with {len(self._best_user_defined)} user-defined, {len(self.best_by_model)} model and {len(self.best_by_targetkey)} target entries") + return self._query_inside(target, workload) is not None 
def _query_inside(self, target, workload): if target is None: diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py index 3ba007d9a4d37..a62ad0ae3eec1 100644 --- a/python/tvm/meta_schedule/testing/custom_builder_runner.py +++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py @@ -84,11 +84,8 @@ def build_relay_with_tensorrt( from tvm.relay.op.contrib import tensorrt from tvm.runtime import Module - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with PassContext( - opt_level=3, - config={"relay.ext.tensorrt.options": config}, - ): + mod = tensorrt.partition_for_tensorrt(mod, params) + with PassContext(opt_level=3): result = relay_build(mod, target=target, target_host=None, params=params) assert isinstance(result, Module) return result diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 89c8fcb17d731..97842738e5cd4 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -32,6 +32,7 @@ from . import transform from . import analysis +from . import collage from .build_module import build, create_executor, optimize from .transform import build_config from . import debug diff --git a/python/tvm/relay/collage/__init__.py b/python/tvm/relay/collage/__init__.py new file mode 100644 index 0000000000000..bb77f69a7c2cb --- /dev/null +++ b/python/tvm/relay/collage/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +from .collage_partitioner import * diff --git a/python/tvm/relay/collage/_ffi_api.py b/python/tvm/relay/collage/_ffi_api.py new file mode 100644 index 0000000000000..afaa5ce98df10 --- /dev/null +++ b/python/tvm/relay/collage/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for the Collage partitioner.""" +import tvm._ffi + + +tvm._ffi._init_api("collage", __name__) diff --git a/python/tvm/relay/collage/collage_partitioner.py b/python/tvm/relay/collage/collage_partitioner.py new file mode 100644 index 0000000000000..88a1b1da8fe2f --- /dev/null +++ b/python/tvm/relay/collage/collage_partitioner.py @@ -0,0 +1,237 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Search for optimal partitionings over Relay models.""" + +import tvm +import numpy as np +from tvm._ffi.registry import register_func, register_object +from tvm.runtime import Object +import logging +import os +import shutil +import math +import tempfile + +from . import _ffi_api + +AUTOTVM_NUM_TRIALS = 2000 +AUTOTVM_EARLY_STOPPING = 600 +MEASURE_NUMBER = 20 +MEASURE_REPEAT = 5 +WARMUP_MIN_REPEAT_MS = 250 +TIMEOUT = 10 + + +@register_object("collage.CostEstimator") +class CostEstimator(Object): + """CostEstimator class""" + + def __init__(self): + self.__init_handle_by_constructor__(_ffi_api.CostEstimator) + + +@register_object("collage.MockEstimator") +class MockEstimator(Object): + """MockEstimator class""" + + def __init__(self, target_costs): + self.__init_handle_by_constructor__(_ffi_api.MockEstimator, target_costs) + + +def arg_for(type, device): + """Returns a test argument of type on device""" + assert isinstance(type, tvm.ir.TensorType) + return tvm.nd.array( + np.random.uniform(-1.0, 1.0, size=type.concrete_shape).astype(type.dtype), device=device + ) + + +def is_already_tuned(task, log_filename): + """Returns true if we already have a tuning record for task in turning logs in log_filename""" + if not os.path.exists(log_filename): + return False + + dispatch_context = 
tvm.autotvm.task.ApplyHistoryBest(log_filename) + return dispatch_context.contains(task.target, task.workload) + + +def extract_autotvm_tasks(mod, target): + return tvm.autotvm.task.extract_from_program(mod, target=target, params=None) + + +def optional_tuning_records(log_filename): + if log_filename == "" or not os.path.exists(log_filename): + return tvm.autotvm.task.FallbackContext() + else: + return tvm.autotvm.task.ApplyHistoryBest(log_filename) + + +def tune_autotvm_tasks(tasks, log_filename): + """Appends to log_filename the best strategies for tasks""" + if len(tasks) == 0: + return + + measure_option = tvm.autotvm.measure_option( + builder=tvm.autotvm.LocalBuilder(timeout=TIMEOUT), + runner=tvm.autotvm.LocalRunner( + number=MEASURE_NUMBER, repeat=MEASURE_REPEAT, timeout=TIMEOUT, min_repeat_ms=0 + ), + ) + + logging.info( + f"Using autotvm tuning for {len(tasks)} tasks with {AUTOTVM_NUM_TRIALS} trials, logging to {log_filename}" + ) + + # create tmp log file, starting with contents from existing log file + tmp_log_filename = log_filename + ".tmp" + if os.path.exists(tmp_log_filename): + os.remove(tmp_log_filename) + if os.path.exists(log_filename): + logging.info(f"Copying existing log {log_filename} to {tmp_log_filename}") + shutil.copy(log_filename, tmp_log_filename) + + for i, task in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + logging.info(f"Considering task {task.name} {prefix}") + if is_already_tuned(task, tmp_log_filename): + logging.info(f"Re-using existing record for {task.name}") + continue + + logging.info(f"Using autotvm to tune {task.name}") + tuner_obj = tvm.autotvm.tuner.XGBTuner(task, loss_type="rank") + if os.path.exists(tmp_log_filename): + tuner_obj.load_history(tvm.autotvm.record.load_from_file(tmp_log_filename)) + + # do tuning + n_trial = min(AUTOTVM_NUM_TRIALS, len(task.config_space)) + tuner_obj.tune( + n_trial=n_trial, + early_stopping=AUTOTVM_EARLY_STOPPING, + measure_option=measure_option, + 
callbacks=[ + tvm.autotvm.callback.progress_bar(n_trial, prefix=prefix), + tvm.autotvm.callback.log_to_file(tmp_log_filename), + ], + ) + + # pick best records and copy back to main log file + tvm.autotvm.record.pick_best(tmp_log_filename, log_filename) + os.remove(tmp_log_filename) + + logging.info("Done with autotvm tuning") + + +def vm_estimate_seconds(device, vm, func_name, args): + # Warmup + vm.benchmark( + device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, func_name=func_name, **args + ) + # For realz this time + return vm.benchmark( + device, + repeat=MEASURE_REPEAT, + number=MEASURE_NUMBER, + min_repeat_ms=0, + func_name=func_name, + **args, + ) + + +@register_func("tvm.relay.collage.estimate_seconds") +def estimate_seconds(mod, target, needs_tvm_tuning): + """Returns the mean execution time of "main" in mod on target with params. The module + may contain "Primitive" functions, possibly with "Compiler" attributes.""" + device = tvm.device(target.kind.device_type) + + try: + # Build the module. + logging.info("Compiling module to estimate") + exe = tvm.relay.vm.compile(mod, target) + except RuntimeError as e: + # A build failure indicates the partition is not supported. + # eg trying to build an nn.batch_norm on GPU, which has no schedule since we assume it + # is only ever used with a tuple projection which is rewritten away. + logging.info(f"Assigning module infinite cost since unable to build: {e}") + return math.inf + + # Finalize compilation + tmp_dir = tempfile.mkdtemp() + code, lib = exe.save() + lib_path = os.path.join(tmp_dir, "library.so") + # TODO(mbs): Avoid nvcc dependency? + lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc") + lib = tvm.runtime.load_module(lib_path) + exe = tvm.runtime.vm.Executable.load_exec(code, lib) + + # Benchmark the module. 
+ vm = tvm.runtime.vm.VirtualMachine(exe, device) + func_name = "main" + main_args = {v.name_hint: arg_for(v.checked_type, device) for v in mod[func_name].params} + logging.info("Benchmarking module to estimate") + profile = vm_estimate_seconds(device, vm, func_name, main_args) + logging.info(f"profile: {profile}") + return profile.median # seconds + + +make_labelled_dfpattern_partition_rule = tvm._ffi.get_global_func( + "relay.collage.make_labelled_dfpattern_partition_rule" +) +make_labelled_dfpattern_partition_rule_with_predicate = tvm._ffi.get_global_func( + "relay.collage.make_labelled_dfpattern_partition_rule_with_predicate" +) +make_pattern_byoc_partition_rule = tvm._ffi.get_global_func( + "relay.collage.make_pattern_byoc_partition_rule" +) + + +def make_labelled_dfpattern_partition_rule_wrapper(compiler, tuple): + if len(tuple) == 2: + rule_name, dataflow_pattern = tuple + return make_labelled_dfpattern_partition_rule(compiler, rule_name, dataflow_pattern) + else: + rule_name, dataflow_pattern, predicate = tuple + return make_labelled_dfpattern_partition_rule_with_predicate( + compiler, rule_name, dataflow_pattern, predicate + ) + + +@register_func("tvm.relay.collage.make_byoc_partition_rule") +def make_byoc_partition_rule(compiler): + """Returns the PartitionRule for BYOC compiler""" + pattern_table = tvm.relay.op.contrib.get_pattern_table(compiler) + assert ( + pattern_table is not None + ), f"No pattern table entry was found for BYOC compiler {compiler}" + logging.info( + f"Converting {len(pattern_table)} rules for {compiler} for use in pattern style BYOC lowering/codegen" + ) + sub_rules = [ + make_labelled_dfpattern_partition_rule_wrapper(compiler, tuple) for tuple in pattern_table + ] + return make_pattern_byoc_partition_rule(compiler, sub_rules) + + +def autotvm_tune_module(mod, target, log_filename): + if log_filename == "": + logging.info("Not tuning with autotvm since disabled") + return + # Extract and tune any TVM kernels. 
BYOC partitions will have no tasks extracted. + logging.info("Extracting tasks from overall module") + tasks = extract_autotvm_tasks(mod, target) + logging.info(f"Auto-tuning {len(tasks)} tasks from overall module") + tune_autotvm_tasks(tasks, log_filename) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index a69e2d4105290..c441c30808c3f 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -26,13 +26,17 @@ from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name from tvm.relay.dataflow_pattern import is_op, wildcard, is_constant, is_tuple, is_tuple_get_item -from tvm.relay.expr import Call, Constant, TupleGetItem +from tvm.relay.expr import Call, Constant, GlobalVar, TupleGetItem from tvm.relay.expr_functor import ExprMutator, ExprVisitor from tvm.relay.op.contrib.register import register_pattern_table logger = logging.getLogger("TensorRT") +def is_tensorrt_compiler_enabled() -> bool: + return tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) is not None + + def is_tensorrt_runtime_enabled() -> bool: """Check if the TensorRT graph executor is present. Returns @@ -40,116 +44,90 @@ def is_tensorrt_runtime_enabled() -> bool: ret: bool True if present, False if not. """ - check_enabled = tvm.get_global_func("relay.op.is_tensorrt_runtime_enabled", True) + check_enabled = tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) if check_enabled: return check_enabled() return False +def get_tensorrt_target() -> tvm.target.Target: + """Returns the current Target, which must be of kind "tensorrt".""" + target = tvm.target.Target.current() + assert target.kind.name == "tensorrt" + return target + + def get_tensorrt_version() -> Tuple[int, int, int]: - """Gets the version of TensorRT that TVM is built against or is targeting. + """Returns the version of TensorRT to assume during compilation. 
+ In order of preference this is taken from: + - The current "tensorrt" target's "tensorrt_version" attribute string. + - The version linked to the TVM runtime. + - (6, 0, 1) Returns ------- ret: Tuple[int, int, int] - TensorRT version as a tuple of major, minor, and patch number. If TVM - is not built with TensorRT, the value set by set_tensorrt_version() is returned instead. + TensorRT version as a tuple of (major, minor, patch). """ - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return tuple(pass_ctx.config["relay.ext.tensorrt.options"].tensorrt_version) # type: ignore - return tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) # type: ignore + target = get_tensorrt_target() + version = target.attrs["tensorrt_version"] + if len(version) == 3: + return int(version[0]), int(version[1]), int(version[2]) + assert len(version) == 0 + + get_version = tvm.get_global_func("relay.ext.tensorrt.get_version", True) + if get_version: + version = get_version() + assert len(version) == 3 + return int(version[0]), int(version[1]), int(version[2]) - -def get_tensorrt_use_implicit_batch_mode() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].use_implicit_batch logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "use_implicit_batch=True." + "TVM was not built against TensorRT and no version was provided to " + "partition_for_tensorrt. 
Defaulting to 6.0.1" ) - return True + return (6, 0, 1) + + +def get_tensorrt_use_implicit_batch_mode() -> bool: + """Returns the "use_implicit_batch" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_implicit_batch"] def get_tensorrt_remove_no_mac_subgraphs() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].remove_no_mac_subgraphs - logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "remove_no_mac_subgraphs=False." - ) - return False + """Returns the "remove_no_mac_subgraphs" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["remove_no_mac_subgraphs"] + + +def get_tensorrt_use_fp16() -> bool: + """Returns the "use_fp16" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_fp16"] def partition_for_tensorrt( mod: tvm.IRModule, params: Optional[Dict[str, tvm.nd.NDArray]] = None, - version: Optional[Tuple[int, int, int]] = None, - use_implicit_batch: bool = True, - remove_no_mac_subgraphs: bool = False, - max_workspace_size: int = 1 << 30, - use_fp16: bool = False, - use_uint8: bool = False, -) -> Tuple[tvm.IRModule, Dict[str, Any]]: - """Partition the graph greedily offloading supported operators to TensorRT. + target: tvm.target.Target = tvm.target.Target("tensorrt"), +) -> tvm.IRModule: + """Partition all functions in mod to greedily offload supported operators to TensorRT. Parameters ---------- mod : tvm.IRModule - The module to run passes on. + The module to partition. + target : tvm.target.Target + A target of kind "tensorrt" describing additional partitioning and compilation options. params : Optional[Dict[str, tvm.nd.NDArray]] Constant input parameters. 
- version : Optional[Tuple[int, int, int]] - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled with - USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. - use_implicit_batch : bool - Use TensorRT implicit batch mode (default true). Setting to false will enable explicit batch - mode which will widen supported operators to include those which modify the batch dimension, - but may reduce performance for some models. - remove_no_mac_subgraphs : bool - Removes subgraphs which have been partitioned for TensorRT if they do not have any - multiply-accumulate operations. The removed subgraphs will go through TVM's standard - compilation instead. Can improve performance. - max_workspace_size : int - How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. - See TensorRT documentation for more info. - use_fp16: bool - Allows, TRT to automatically convert FP32 inputs to FP16. Also, it is required to be enabled - if FP16 inputs tensors and weights are used. - Note that TensorRT will still choose a higher-precision kernel if it results in overall - lower runtime, or if no low-precision implementation exists. - use_uint8: bool - Allows, TRT to automatically convert FP32 inputs to UINT8. Returns ------- - mod_and_config : Tuple[tvm.IRModule, Dict[str, Any]] - A tuple of 1) annotated and partitioned module and 2) "relay.ext.tensorrt.options" - configuration which should be given to PassContext when building. + partitioned_mod : tvm.IRModule + The partitioned module. 
""" - config: Dict[str, Any] = { - "use_implicit_batch": use_implicit_batch, - "max_workspace_size": max_workspace_size, - "remove_no_mac_subgraphs": remove_no_mac_subgraphs, - "use_fp16": use_fp16, - "use_uint8": use_uint8, - } - if version: - assert isinstance(version, tuple) and len(version) == 3 - config["tensorrt_version"] = version - else: - linked_version = tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) - if not linked_version: - logger.warning( - "TVM was not built against TensorRT and no version was provided to " - "partition_for_tensorrt. Defaulting to 6.0.1" - ) - linked_version = (6, 0, 1) - config["tensorrt_version"] = linked_version - if params: mod["main"] = bind_params_by_name(mod["main"], params) @@ -174,24 +152,27 @@ def partition_for_tensorrt( transform.InferType(), ] ) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with target: mod = seq(mod) - # TODO(mbs): Revisit - # mod = prune_tensorrt_subgraphs(mod) - return mod, config + mod = prune_tensorrt_subgraphs(mod) + return mod def is_supported_trt_type(typ: Union[tvm.ir.TensorType, tvm.ir.TupleType], op_name: str) -> bool: """Check whether a type is supported by TensorRT.""" - supported_dtypes = ["float32", "float16"] + supported_dtypes = ["float32"] + if get_tensorrt_use_fp16(): + supported_dtypes.append("float16") if isinstance(typ, tvm.ir.TensorType): if typ.dtype not in supported_dtypes: - logger.info(f"{op_name}: Only float32 and float16 tensor dtypes are supported.") + logger.info(f"{op_name}: Only {supported_dtypes} tensor dtypes are supported.") return False - # assumes dim 0 is for batch and can be dynamic - # TODO(mbs): But does this depend use_implicit_batch flag? - for dim_shape in typ.shape[1:]: - if isinstance(dim_shape, tvm.tir.expr.Any): + dims = typ.shape + if get_tensorrt_use_implicit_batch_mode(): + # The first dimension can be Any. 
+ dims = dims[1:] + for dim in dims: + if isinstance(dim, tvm.tir.expr.Any): logger.info(f"{op_name}: Only statically known tensor shapes are supported.") return False elif isinstance(typ, tvm.ir.TupleType): @@ -247,7 +228,10 @@ def predicate(expr: relay.expr.Expr) -> bool: args = get_args(expr) if not all([is_supported_trt_type(arg.checked_type, op_name) for arg in args]): return False - return checker(attrs, args, op_name) + if not checker(attrs, args, op_name): + return False + logger.info(f"{op_name}: Predicate passes") + return True return predicate @@ -535,11 +519,16 @@ def concatenate_checker( if int(attrs.axis) == 0: logger.info(f"{op_name}: can't modify batch dimension.") return False - if isinstance(args[0], relay.Tuple): - for tuple_input in args[0].fields: - if isinstance(tuple_input, Constant): - logger.info(f"{op_name}: can't concatenate tensors with constants.") - return False + + if not isinstance(args[0], relay.Tuple): + logger.info("f{op_name}: concatenate must be applied to a literal tuple") + return False + + for tuple_input in args[0].fields: + if isinstance(tuple_input, Constant): + logger.info(f"{op_name}: can't concatenate tensors with constants.") + return False + return True diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index d7979a757171b..fa2c2ceeb5ee2 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -1461,3 +1461,45 @@ def InlineCompilerFunctionsBoundTo(global_vars): The pass. """ return _ffi_api.InlineCompilerFunctionsBoundTo(global_vars) + + +def CaptureIndexInSpans(): + """Captures the post-dfs index and dominator post-dfs index of (most) expression nodes in + their span, in the form "index::". + + This is useful for debugging since a) it helps identify pretty-printed sub-expressions within + the overall model and b) the indexes are heavily used by Collage for its compact representation + of sub-graphs. 
+ + Note that Op and Constructor nodes are not changed even though they are assigned an + post-dfs index. + + Returns + ------- + ret : tvm.transform.Pass + The pass. + """ + return _ffi_api.CaptureIndexInSpans() + + +def CollagePartition(config, cost_estimator=None): + """Partition the bodies of all functions according to the available targets so as to + minimize model latency. See https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md. + + Parameters + ---------- + config : CompilationConfig + The available targets. + cost_estimator : CostEstimator, optional + The custom cost estimator to use for costing each candidate partition. + + Returns + ------- + ret : tvm.transform.Pass + The pass. + + """ + if cost_estimator is None: + cost_estimator = relay.collage.CostEstimator() + + return _ffi_api.CollagePartition(config, cost_estimator) diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index e08cd240d4d1e..ec1887cee9097 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -33,42 +33,56 @@ #include "../codegen_json/codegen_json.h" #if TVM_GRAPH_EXECUTOR_TENSORRT +#include "../../../transforms/compiler_function_utils.h" #include "NvInfer.h" #endif namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { -/*! \brief Attributes to store the compiler options for TensorRT. 
*/ -struct TensorRTCompilerConfigNode : public tvm::AttrsNode { - Array tensorrt_version; - bool use_implicit_batch; - size_t max_workspace_size; - bool remove_no_mac_subgraphs; - bool use_fp16; - bool use_uint8; - - TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { - TVM_ATTR_FIELD(tensorrt_version) - .describe("TensorRT version as (major, minor, patch).") - .set_default(Array({6, 0, 1})); - TVM_ATTR_FIELD(use_implicit_batch).set_default(true); - TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); - TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); - TVM_ATTR_FIELD(use_fp16).set_default(false); - TVM_ATTR_FIELD(use_uint8).set_default(false); - } -}; +/*! + * \brief Check whether TensorRT graph executor is enabled. + * \return True if enabled, False if not. + */ +inline constexpr bool IsRuntimeEnabled() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return true; +#else + return false; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} -class TensorRTCompilerConfig : public Attrs { - public: - TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, - TensorRTCompilerConfigNode); -}; +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.is_runtime_enabled").set_body_typed(IsRuntimeEnabled); -TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); -TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); +/*! + * \brief Get TensorRT version that TVM is built against. + * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph + * runtime is not enabled. + */ +Array GetVersion() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; +#else + return {}; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} + +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.get_version").set_body_typed(GetVersion); + +/*! + * \brief Returns the "tensorrt" Target instance to use for compilation. 
+ */ +Target GetTensorRTTarget() { + Target target = Target::Current(/*allow_not_defined=*/true); + if (!target.defined() || target->kind->name != "tensorrt") { + // Since we allow partition_for_tensorrt to use the default "tensorrt" target, we should + // similarly allow the custom pass to execute without a specific "tensorrt" target in scope. + target = Target("tensorrt"); + } + return target; +} using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -87,6 +101,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { explicit CollectFromCompositeFunctionBody(TensorRTJSONSerializer* serializer) : serializer_(serializer), node_(std::make_shared()) {} + // We'll need to implement these out-of-band since they use the serializer. void VisitExpr_(const ConstantNode* constant_node) final; void VisitExpr_(const CallNode* call_node) final; @@ -190,6 +205,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { extractor.Extract(const_cast(attr_obj)); } + /*! \brief The parent serializer for the overall TensorRT partition. */ TensorRTJSONSerializer* serializer_; /*! \brief Accumulated translated arguments. */ std::vector args_; @@ -207,9 +223,10 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { */ class TensorRTJSONSerializer : public JSONSerializer { public: - TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) - : JSONSerializer(symbol, expr) {} + TensorRTJSONSerializer(Target target, const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr), target_(std::move(target)) {} + private: using JSONSerializer::VisitExpr_; std::vector VisitExpr_(const CallNode* call_node) final { @@ -245,40 +262,58 @@ class TensorRTJSONSerializer : public JSONSerializer { node->CaptureAttrs(*collector.node_); // Capture global settings on the JSON node. - SaveGlobalAttributes(node); + // TODO(mbs): Why on every call? 
+ SaveGlobalAttributes(node.get()); VLOG(1) << name << " has " << node->GetInputs().size() << " inputs"; return AddNode(node, GetRef(call_node)); } - static void SaveGlobalAttributes(std::shared_ptr node) { - auto ctx = transform::PassContext::Current(); - auto cfg = ctx->GetConfig("relay.ext.tensorrt.options"); - if (!cfg.defined()) { - cfg = AttrsWithDefaultValues(); + static void SetAttr(JSONGraphNode* node, const std::string& key, + std::vector values) { + node->SetAttr(key, std::vector({std::move(values)})); + } + + /*! \brief Capture the compilation options as attributes on \p node. */ + void SaveGlobalAttributes(JSONGraphNode* node) { + { + Array target_attr = target_->GetAttr>("tensorrt_version").value(); + if (target_attr.empty()) { + target_attr = GetVersion(); + } + if (target_attr.empty()) { + target_attr = {6, 0, 1}; + } + ICHECK_EQ(target_attr.size(), 3); + SetAttr(node, "tensorrt_version", + {std::to_string(target_attr[0]), std::to_string(target_attr[1]), + std::to_string(target_attr[2])}); + } + + { + Bool target_attr = target_->GetAttr("use_implicit_batch").value(); + SetAttr(node, "use_implicit_batch", {std::to_string(target_attr->value)}); + } + + { + Integer target_attr = target_->GetAttr("max_workspace_size").value(); + SetAttr(node, "max_workspace_size", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_fp16").value(); + SetAttr(node, "use_fp16", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_uint8").value(); + SetAttr(node, "use_uint8", {std::to_string(target_attr->value)}); } - ICHECK_EQ(cfg.value()->tensorrt_version.size(), 3); - std::vector tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), - std::to_string(cfg.value()->tensorrt_version[1]), - std::to_string(cfg.value()->tensorrt_version[2])}; - std::vector use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; - std::vector max_workspace_size = 
{std::to_string(cfg.value()->max_workspace_size)}; - std::vector use_fp16 = {std::to_string(cfg.value()->use_fp16)}; - std::vector use_uint8 = {std::to_string(cfg.value()->use_uint8)}; - std::vector tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr, - use_fp16_attr, use_uint8_attr; - tensorrt_version_attr.emplace_back(tensorrt_version); - use_implicit_batch_attr.emplace_back(use_implicit_batch); - max_workspace_size_attr.emplace_back(max_workspace_size); - use_fp16_attr.emplace_back(use_fp16); - use_uint8_attr.emplace_back(use_uint8); - node->SetAttr("tensorrt_version", tensorrt_version_attr); - node->SetAttr("use_implicit_batch", use_implicit_batch_attr); - node->SetAttr("max_workspace_size", max_workspace_size_attr); - node->SetAttr("use_fp16", use_fp16_attr); - node->SetAttr("use_uint8", use_uint8_attr); } + + /*! \brief The "tensorrt" Target guiding compilation. */ + Target target_; }; void CollectFromCompositeFunctionBody::VisitExpr_(const ConstantNode* constant_node) { @@ -304,64 +339,75 @@ void CollectFromCompositeFunctionBody::VisitExpr_(const CallNode* call_node) { } /*! - * \brief Create a runtime module for TensorRT. - * \param ref The ext_func Relay expression/module to be executed using extern ops. - * \return A runtime module. - */ -runtime::Module TensorRTCompiler(const ObjectRef& ref) { - ICHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; - Function func = Downcast(ref); - std::string func_name = backend::GetExtSymbol(func); - - VLOG(1) << "TensorRT partition:" << std::endl << PrettyPrint(func); - TensorRTJSONSerializer serializer(func_name, func); - serializer.serialize(); - std::string graph_json = serializer.GetJSON(); - VLOG(1) << "TensorRT JSON:" << std::endl << graph_json; - - // Note that serializer.const_name_to_constant() is ignored. 
Instead the TECompiler invokes - // a callback which calls backend::UpdateConstants to capture the map before the function - // 'disappears' into lowered form, on the assumption the visit order and thus constant - // names match those generated by the JSONSerializer. - - const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); - ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; - VLOG(1) << "Creating tensorrt runtime::Module for '" << func_name << "'"; - runtime::Module lib = (*pf)(func_name, graph_json, serializer.const_names()); - return lib; -} - -TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); - -/*! - * \brief Check whether TensorRT graph executor is enabled. - * \return True if enabled, False if not. + * \brief The main TensorRT compiler. + * + * TODO(mbs): Currently we create a \p TensorRTRuntimeModule for every function with + * Compiler="tensorrt" (ie for each partition). Since the TensorRT engine is only designed to + * handle a single entry point this is mostly sensible, however there are probably opportunities + * for more sharing between functions. However, note this means each call to a TensorRT-compiled + * function will require a linear scan of imported runtime modules to find the matching + * TensorRTRuntimeModule implementing it. */ -inline constexpr bool IsTensorRTRuntimeEnabled() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return true; -#else - return false; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +transform::Pass CompileForTensorRTImpl() { + auto pass_func = [](IRModule mod, const transform::PassContext& pass_ctx) { + VLOG(1) << "CompileForTensorRT input:" << std::endl << PrettyPrint(mod); + Target target = GetTensorRTTarget(); + + const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); + ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; + + // The accumulated external runtime modules. 
+ Array external_mods = + mod->GetAttr>(tvm::attr::kExternalMods, Array()) + .value(); + // The accumulated constant bindings. + Map const_name_to_ndarray = + mod->GetAttr>(tvm::attr::kConstNameToNDArray, + Map()) + .value(); + + for (const auto& kv : mod->functions) { + if (const auto* function_node = kv.second.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler && opt_compiler.value() == "tensorrt") { + // Serialize the function to JSON. + TensorRTJSONSerializer serializer(target, kv.first->name_hint, + GetRef(function_node)); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + VLOG(1) << "TensorRT JSON for '" << kv.first->name_hint << "':" << std::endl + << graph_json; + + // Remember all the constant bindings. + for (const auto& kv2 : serializer.const_name_to_constant()) { + ICHECK_EQ(const_name_to_ndarray.count(kv2.first), 0); + const_name_to_ndarray.Set(kv2.first, kv2.second); + } + + // Create the actual runtime module. + runtime::Module runtime_mod = + (*pf)(kv.first->name_hint, graph_json, serializer.const_names()); + + // Remember the runtime module. + external_mods.push_back(runtime_mod); + } + } + } + } + return WithAttrs(mod, {{tvm::attr::kExternalMods, external_mods}, + {tvm::attr::kConstNameToNDArray, const_name_to_ndarray}}); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "CompileForTensorRT", {}); } -/*! - * \brief Get TensorRT version that TVM is built against. - * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph - * runtime is not enabled. 
- */ -Array GetTensorRTVersion() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; -#else - return {}; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +transform::Pass CompileForTensorRT() { + return transform::Sequential( + {transforms::OutlineCompilerFunctionsWithExistingGlobalSymbols("tensorrt"), + CompileForTensorRTImpl(), transforms::MarkCompilerFunctionsAsExtern("tensorrt")}); } -TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") - .set_body_typed(IsTensorRTRuntimeEnabled); -TVM_REGISTER_GLOBAL("relay.op.get_tensorrt_version").set_body_typed(GetTensorRTVersion); - +} // namespace tensorrt } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/tensorrt/codegen.h b/src/relay/backend/contrib/tensorrt/codegen.h new file mode 100644 index 0000000000000..813a8663756dd --- /dev/null +++ b/src/relay/backend/contrib/tensorrt/codegen.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/tensorrt/codegen.h + * \brief The 'custom' compilation pass for TensorRT (invoked by the RelayToTIRTargetHook pass). 
+ */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ +#define TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ + +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace tensorrt { + +/*! + * \brief Returns the pass which replaces all calls to "Primitive" functions with a "Compiler" + * attribute of "tensorrt" with calls to an extern which is implemented by a \p TensorRTRuntime + * runtime module added to the IRModule's "external_mods" attribute. + */ +transform::Pass CompileForTensorRT(); + +} // namespace tensorrt +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ diff --git a/src/relay/backend/contrib/tensorrt/target.cc b/src/relay/backend/contrib/tensorrt/target.cc index 85d127ab71152..2e4581d30a3c6 100644 --- a/src/relay/backend/contrib/tensorrt/target.cc +++ b/src/relay/backend/contrib/tensorrt/target.cc @@ -24,19 +24,46 @@ #include +#include "./codegen.h" + namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { /*! * \brief This external codegen target can offload compilation to the TensorRT compiler. * - Patterns: python/tvm/relay/op/contrib/tensorrt.py * - Custom compiler: src/relay/backend/contrib/tensorrt/codegen.cc - * - Runtime: src/runtime/contrib/tensorrt/ *.cc + * - Runtime: src/runtime/contrib/tensorrt/... */ TVM_REGISTER_TARGET_KIND("tensorrt", kDLCUDA) - .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)) + .set_attr("RelayToTIR", CompileForTensorRT()) + // A array of three integers given the major, minor, and patch numbers for the supported + // TensorRT compiler version. If empty will be auto-detected from linked library. Default empty. + .add_attr_option>("tensorrt_version", Array()) + // If true, the first tensor dimension for most operators is allowed to be Any and + // TensorRT will assume it represents a batch dimension only known at inference time. 
+ // Fewer Relay operators are supported in implicit batch mode. Default true. + .add_attr_option("use_implicit_batch", Bool(true)) + // If true, excludes sub-graphs which do not have multiply-accumulate operations, even though + // TensorRT supports them. ad. This is a simple heuristic to optimize the partitioning between + // TensorRT and TVM. Not required if using Collage for partitioning. Defalut false. + .add_attr_option("remove_no_mac_subgraphs", Bool(false)) + // How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. + // Default 1G. + .add_attr_option("max_workspace_size", Integer(1 << 30)) + // If true, allows TensorRT to automatically convert float32 operations to float16. Must also be + // enabled if any float16 operations are in the model. Note that TensorRT may still choose a + // higher-precision kernel if it results in overall lower runtime, or if no low-precision + // implementation exists. Default false. + .add_attr_option("use_fp16", Bool(false)) + // If true, allows TensorRT to automatically convert float32 operations to uint8 + // (aka quantized). Default false. 
+ .add_attr_option("use_uint8", Bool(false)); +} // namespace tensorrt } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index bd3047e2862c1..4376f87787086 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -343,6 +343,7 @@ relay::Function BindParamsByName(relay::Function func, void BindParamsInModule(IRModule mod, const std::unordered_map& params) { + VLOG(1) << "BindParamsInModule"; if (!params.empty()) { BaseFunc base_func = mod->Lookup("main"); ICHECK(base_func->IsInstance()); diff --git a/src/relay/collage/README.md b/src/relay/collage/README.md new file mode 100644 index 0000000000000..dc56496092cc0 --- /dev/null +++ b/src/relay/collage/README.md @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + +The `CollagePartition` pass for finding optimal partitionings of Relay models. + +See the [RFC](https://github.com/mbs-octoml/mbs-tvm-rfcs/blob/mbs-rfcs-collage/rfcs/xxxx-collage.md). + +Based on: +> *Collage: Automated Integration of Deep Learning Backends* +> Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia + +CAUTION: This is a prototype, do not use in prod. diff --git a/src/relay/collage/candidate_function_cache.cc b/src/relay/collage/candidate_function_cache.cc new file mode 100644 index 0000000000000..32982dc08f3d7 --- /dev/null +++ b/src/relay/collage/candidate_function_cache.cc @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_function_cache.cc + * \brief A cache of the unique global name and costs for partitioned functions. + */ + +#include "./candidate_function_cache.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidateFunctionCache::Entry& CandidateFunctionCache::GetEntry(const std::string& label, + const Function& function) { + auto itr = cache_.find(function); + if (itr == cache_.end()) { + String compiler = function->GetAttr(attr::kCompiler, String("tvm")).value(); + std::string global_symbol_name = name_supply_->Fresh({compiler, label}); + GlobalVar global_symbol(std::move(global_symbol_name), function->checked_type()); + itr = cache_.emplace(function, Entry(std::move(global_symbol))).first; + } + return itr->second; +} + +GlobalVar CandidateFunctionCache::GetGlobalSymbol(const Function& function) { + return GetEntry(/*label=*/"", function).global_symbol; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_function_cache.h b/src/relay/collage/candidate_function_cache.h new file mode 100644 index 0000000000000..322128c46fbad --- /dev/null +++ b/src/relay/collage/candidate_function_cache.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_function_cache.h + * \brief A cache of the unique global symbol name and cost for partitioned functions. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ + +#include +#include +#include +#include + +#include "../transforms/compiler_function_utils.h" +#include "cost.h" +#include "name_supply.h" +#include "tvm/relay/function.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief A cache of the unique global symbol and cost for functions extracted to represent + * partitions. If two functions are structurally equal (which includes equality of their "Compiler" + * attributes) then they will share the same global symbol and estimated cost. We rely on the + * function's attributes to distinguish partitions which are structurally the same graph but + * intended for different targets. + */ +class CandidateFunctionCache : public transforms::GlobalSymbolCache { + public: + explicit CandidateFunctionCache(std::shared_ptr name_supply) + : name_supply_(std::move(name_supply)) {} + + struct Entry { + GlobalVar global_symbol; + Cost cost = Cost::Unknown(); // Filled in when have estimated cost. + + explicit Entry(GlobalVar global_symbol) : global_symbol(std::move(global_symbol)) {} + }; + + /*! + * \brief Returns the unique entry for \p function. 
If no such entry already exists, create it + * and assign it a unique global symbol name. + */ + Entry& GetEntry(const std::string& label, const Function& function); + + GlobalVar GetGlobalSymbol(const Function& function) final; + + private: + std::shared_ptr name_supply_; + std::unordered_map cache_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_FUNCTION_CACHE_H_ diff --git a/src/relay/collage/candidate_partition.cc b/src/relay/collage/candidate_partition.cc new file mode 100644 index 0000000000000..45365d0c7e0f8 --- /dev/null +++ b/src/relay/collage/candidate_partition.cc @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.cc + * \brief A potential partition in the search. 
+ */ + +#include "./candidate_partition.h" + +#include +#include +#include + +#include "../transforms/compiler_function_utils.h" +#include "./candidate_function_cache.h" +#include "./candidate_set.h" +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(CandidatePartitionNode); + +void CandidatePartitionNode::VisitAttrs(AttrVisitor* v) { + v->Visit("rule_name", &rule_name_); + v->Visit("sub_graph", &sub_graph_); + v->Visit("spec", &spec_); + // TODO(mbs): cost_ +} + +PartitionSpec CandidatePartitionNode::partition_spec() const { + return Downcast(spec_); +} + +std::string CandidatePartitionNode::partition_spec_name() const { + return Downcast(spec_)->spec_name_; +} + +Target CandidatePartitionNode::target() const { return Downcast(spec_)->target_; } + +std::string CandidatePartitionNode::ToSummary(const DataflowGraph& dataflow_graph) const { + std::ostringstream os; + os << sub_graph_->label_; + os << " | ("; + bool first = true; + for (PostDfsIndex index : sub_graph_->input_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") -> ("; + first = true; + for (PostDfsIndex index : sub_graph_->exit_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") | "; + os << sub_graph_->inside_.ToString(); + os << " | "; + os << partition_spec_name(); + os << " | "; + os << cost_.ToString(); + return os.str(); +} + +std::string CandidatePartitionNode::ToString() const { + std::ostringstream os; + os << "{rule_name=" << rule_name_; + os << ",sub_graph=" << sub_graph_->ToString(); + os << ",spec_name=" << 
partition_spec_name(); + if (!cost_.is_unknown()) { + os << ",cost=" << cost_.ToString(); + } + os << "}"; + return os.str(); +} + +namespace { +/*! + * \brief If function's body is a call to an inlined "Primitive" function, return it. + * Otherwise return function directly. + */ +Function GetPrimitiveFunction(const Function& function) { + if (const auto* call_node = function->body.as()) { + if (const auto* function_node = call_node->op.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + return GetRef(function_node); + } + } + } + return function; +} + +/*! + * \brief Eta-expand any tuple arguments of \p function. Ie rewrite: + * \code + * f(x: (t1, t2)) { ... x ... } + * \endcode + * to + * \code + * f(x_1: t1, x_2: t2) { ... (x_1, x_2) ... } + * \endcode + */ +Function EtaExpandTuples(const Function& function) { + Map subst; + Array new_params; + for (const auto& param : function->params) { + std::vector tensor_types = FlattenTupleType(param->type_annotation); + if (tensor_types.size() == 1) { + new_params.push_back(param); + } else { + Array fields; + for (size_t i = 0; i < tensor_types.size(); ++i) { + Var new_param(param->name_hint() + "_" + std::to_string(i), tensor_types[i], param->span); + new_param->checked_type_ = tensor_types[i]; + new_params.push_back(new_param); + fields.push_back(new_param); + } + Tuple new_tuple(fields); + subst.Set(param, new_tuple); + } + } + if (subst.empty()) { + return function; + } + return WithFields(function, new_params, Bind(function->body, subst)); +} + +} // namespace + +Cost CandidatePartitionNode::EstimatedCost(const DataflowGraph& dataflow_graph, + CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) const { + if (cost_.is_unknown()) { + VLOG_CONTEXT << "spec " << partition_spec_name(); + Function extracted_function = sub_graph_->ExtractAsFunction(dataflow_graph); + extracted_function = EtaExpandTuples(extracted_function); + VLOG(2) << "Validating function:" << 
std::endl << PrettyPrint(extracted_function); + String error = partition_spec()->validate_sub_graph_func_(extracted_function); + if (!error.empty()) { + cost_ = Cost::Invalid(); + VLOG(1) << "Unable to rewrite function: " << error; + } else { + // The extracted function may be the eta-expansion of a "Primitive" function. + // If so we want the cached external name and cost to be w.r.t. that function + // rather than the outer so that we'll get a cache hit when we outline functions + // in the final program. + Function primitive_function = GetPrimitiveFunction(extracted_function); + CandidateFunctionCache::Entry& entry = + cache->GetEntry(sub_graph_->label_, primitive_function); + if (entry.cost.is_unknown()) { + IRModule mod = IRModule::FromExpr(extracted_function); + VLOG(1) << "Outlining:" << std::endl << PrettyPrint(mod); + mod = OutlineCompilerFunctions(cache)(mod); + VLOG(1) << "Estimating cost of:" << std::endl + << PrettyPrint(mod) << std::endl + << "using target " << target()->ToDebugString(); + entry.cost = cost_estimator->Estimate(mod, target(), + /*needs_tvm_tuning=*/!target().IsExternalCodegen()); + VLOG(1) << "Measured cost as " << entry.cost.ToString(); + } else { + VLOG(1) << "Reusing cost " << entry.cost.ToString() + << " cached in candidate function cache"; + } + cost_ = entry.cost; + } + } else { + VLOG(1) << "Reusing cost " << cost_.ToString() << " cached in candidate"; + } + return cost_; +} + +CandidatePartition::CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_graph_ = std::move(sub_graph); + node->spec_ = std::move(spec); + node->cost_ = cost; + data_ = std::move(node); +} + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name) { + if (rule_name == candidate->rule_name_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->rule_name_ = 
std::move(rule_name); + return GetRef(node); +} + +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph) { + if (sub_graph == candidate->sub_graph_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->sub_graph_ = std::move(sub_graph); + return GetRef(node); +} + +bool CandidatePartition::operator<(const CandidatePartition& that) const { + // Order lexicographically on sub-graphs. + if (*get()->sub_graph_.get() < *that->sub_graph_.get()) { + return true; + } + if (*that->sub_graph_.get() < *get()->sub_graph_.get()) { + return false; + } + // Break ties by rule name. + return get()->rule_name_ < that->rule_name_; +} + +bool CandidatePartition::AreTouching(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + return get()->spec_ == that->spec_ && + get()->sub_graph_.AreTouching(dataflow_graph, that->sub_graph_); +} + +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + ICHECK_EQ(get()->spec_, that->spec_); + return CandidatePartition(UnionLabels(get()->rule_name_, that->rule_name_), + get()->sub_graph_.DisjointUnion(dataflow_graph, that->sub_graph_), + get()->spec_, get()->cost_ + that->cost_); +} + +/*static*/ +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates) { + ICHECK_GT(candidates.size(), 1); + CandidatePartition result = candidates.front(); + for (size_t i = 1; i < candidates.size(); ++i) { + result = result.DisjointUnion(dataflow_graph, candidates[i]); + } + return result; +} + +/*static*/ +Expr CandidatePartition::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + const std::vector& candidates) { + std::vector sub_graphs; + for (const auto& candidate : candidates) { + sub_graphs.emplace_back(candidate->sub_graph_); + } + return SubGraph::ParallelRewrite(dataflow_graph, expr, sub_graphs); +} + +/*static*/ +std::vector 
CandidatePartition::MaxCoalesce( + const DataflowGraph& dataflow_graph, std::vector candidates) { + VLOG(1) << "Running MaxCoalesce over " << candidates.size() << " candidates"; + // This is an eager version of using the simple (kOpaque, kOpaque) combiner. + + // Switch to set representation. + CandidateSet result_set(std::move(candidates)); + + // Until fixed point... + size_t num_rounds = 0; + while (result_set.PrepareForNextRound()) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << result_set.size() << " candidates (" << result_set.first_new_index() + << " existing)"; + IndexSet removed_this_round(result_set.size()); // over candidate indexes! + + // Build map from post-dfs indices to the indices of candidates with corresponding entry node. + // NOTE: the index set is over candidate indices not post-dfs indices! + std::vector entry_map(dataflow_graph.size(), IndexSet(result_set.size())); + for (size_t i = 0; i < result_set.size(); ++i) { + CandidatePartition candidate = result_set.at(i); + for (PostDfsIndex entry_index : candidate->sub_graph_->entry_) { + entry_map[entry_index].Add(i); + } + } + + for (size_t i = 0; i < result_set.size(); ++i) { + if (removed_this_round[i]) { + // Already merged. + continue; + } + CandidatePartition upstream = result_set.at(i); + // Narrow our search to just those candidates which could touch. + IndexSet possible_downstream(result_set.size()); // over candidate indexes! + for (PostDfsIndex output_index : upstream->sub_graph_->output_) { + possible_downstream = possible_downstream | entry_map[output_index]; + } + for (size_t j : possible_downstream) { + if (removed_this_round[j]) { + // Already merged. + continue; + } + if (i == j) { + // Ignore self. 
+ continue; + } + CandidatePartition downstream = result_set.at(j); + if (!upstream.AreTouching(dataflow_graph, downstream)) { + continue; + } + CandidatePartition new_candidate = upstream.DisjointUnion(dataflow_graph, downstream); + VLOG(2) << "Merging upstream candidate " << upstream->ToString() + << " and downstream candidate " << downstream->ToString() << " to yield " + << new_candidate->ToString(); + result_set.Add(dataflow_graph, new_candidate); + result_set.Remove(upstream); + removed_this_round.Add(i); + result_set.Remove(downstream); + removed_this_round.Add(j); + } + } + } + + // Restore canonical order. + result_set.sort(); + + VLOG(1) << "MaxCoalesce produced " << result_set.size() << " candidates"; + return result_set.MovedCurrentCandidates(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_partition.h b/src/relay/collage/candidate_partition.h new file mode 100644 index 0000000000000..1e324666fc658 --- /dev/null +++ b/src/relay/collage/candidate_partition.h @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.h + * \brief A potential partition in the search. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ + +#include +#include + +#include +#include +#include + +#include "./candidate_function_cache.h" +#include "./cost.h" +#include "./cost_estimator.h" +#include "./name_supply.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +class PartitionSpec; + +/*! + * \brief A candidate partition w.r.t. the body of an overall Relay expression. + * + * We represent the partition as a sub-graph. This means not only can we represent the scope + * of Relay sub-expressions intended for a particular partition (or kernel), but we can also + * represent various conventions for encoding how the operators in the partition should be + * tagged for downstream processing. + */ +class CandidatePartitionNode : public Object { + public: + CandidatePartitionNode() = default; + + /*! + * \brief Combination of all the partition rule names which produced this candidate. + * For debugging and explainability. + */ + String rule_name_; + + /*! + * \brief The sub-graph of the overall expression matched by the partition rule. + */ + SubGraph sub_graph_; + + /*! + * \brief The partition specification which produced this candidate. + */ + ObjectRef /* actually PartitionSpec */ spec_; + + /*! + * \brief The (cached) cost of the partition. + * + * Initially Cost::Unknown, calculated and cached by EstimateCost. + */ + mutable Cost cost_ = Cost::Unknown(); + + void VisitAttrs(AttrVisitor* v); + + /*! + * \brief Returns the partition specification which produced this candidate. + */ + PartitionSpec partition_spec() const; + + /*! + * \brief Returns the name of the partition specification which produced this candidate. + */ + std::string partition_spec_name() const; + + /*! + * \brief Returns the target of the partition specification which produced this candidate. + */ + Target target() const; + + /*! 
+ * \brief Return the estimated cost of the candidate partition, using \p cost_estimator and + * \p cache. + */ + Cost EstimatedCost(const DataflowGraph& dataflow_graph, CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) const; + + /*! + * \brief Returns a brief description of candidate suitable for debugging output. + */ + std::string ToSummary(const DataflowGraph& dataflow_graph) const; + + std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.CandidatePartition"; + TVM_DECLARE_FINAL_OBJECT_INFO(CandidatePartitionNode, Object); +}; + +class CandidatePartition : public ObjectRef { + public: + CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost = Cost::Unknown()); + + bool operator<(const CandidatePartition& that) const; + + /*! + * \brief Returns true if this and \p that candidate are disjoint, have the same (or no) target, + * and touch. This does not imply the \p DisjointUnion of this and that will be valid. For + * example, the result may be too deep or have too many outputs. + */ + bool AreTouching(const DataflowGraph& dataflow_graph, const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of this and \p that. + */ + CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of all \p candidates. + */ + static CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates); + + /*! + * \brief Returns \p expr rewritten to apply all the partitions implied by \p candidates. + * The candidates can be in any order but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + const std::vector& candidates); + + /*! + * Eagerly merge all touching candidates for the same target. 
The candidates must be disjoint + * and have their Targets filled in. This is typically called on the optimal list of candidate + * partitions found by the Collage search in order to remove unnecessary partition boundaries. + * Ideally the search would never produce such candidates however to keep the search space + * manageable Collage may only consider candidate partitions up to a particular depth. + */ + static std::vector MaxCoalesce(const DataflowGraph& dataflow_graph, + std::vector candidates); + + TVM_DEFINE_OBJECT_REF_METHODS(CandidatePartition, ObjectRef, CandidatePartitionNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(CandidatePartitionNode); +}; + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name); +CandidatePartition WithTarget(CandidatePartition candidate, Target target); +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph); + +struct CandidatePartitionHash { + size_t operator()(const CandidatePartition& candidate) const { + return candidate->sub_graph_->hash(); + } +}; + +struct CandidatePartitionEquals { + bool operator()(const CandidatePartition& left, const CandidatePartition& right) const { + return *left->sub_graph_.get() == *right->sub_graph_.get(); + } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ diff --git a/src/relay/collage/candidate_partition_index.cc b/src/relay/collage/candidate_partition_index.cc new file mode 100644 index 0000000000000..7541df87d331c --- /dev/null +++ b/src/relay/collage/candidate_partition_index.cc @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/candidate_partition_index.cc + * \brief Index for finding relevant candidate partitions for a particular search state. + */ + +#include "./candidate_partition_index.h" + +#include "./gather_partition_specs.h" +#include "./prune_candidates.h" +#include "utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidatePartitionIndex::CandidatePartitionIndex( + const std::unordered_map* virtual_devices, + DataflowGraph* dataflow_graph) + : virtual_devices_(virtual_devices), + dataflow_graph_(dataflow_graph), + first_inside_index_to_candidates_(dataflow_graph->size()) {} + +void CandidatePartitionIndex::Index(const Array& partition_specs) { + std::vector candidates = Collect(partition_specs); + candidates = PruneCandidates(*dataflow_graph_, candidates); + // Index the candidates by their first inside index. 
+ for (auto& candidate : candidates) { + first_inside_index_to_candidates_[candidate->sub_graph_->first_inside_index_].emplace_back( + candidate); + } + size_ = candidates.size(); +} + +void CandidatePartitionIndex::EstimateAllCosts(CostEstimator cost_estimator, + const CompilationConfig& config, + std::shared_ptr cache) { + size_t n = 0; + for (PostDfsIndex index = 0; index < dataflow_graph_->size(); ++index) { + for (const auto& candidate : first_inside_index_to_candidates_[index]) { + LOG(INFO) << "Estimating cost of candidate " << candidate->ToSummary(*dataflow_graph_) << " [" + << n++ << "/" << size_ << "]"; + // Cost will be cached in candidate as a side effect. + Cost cost = candidate->EstimatedCost(*dataflow_graph_, cost_estimator, config, cache); + LOG(INFO) << "Candidate has cost " << cost.ToString(); + } + } +} + +std::string CandidatePartitionIndex::ToSummary() const { + std::vector lines; + for (const auto& candidates : first_inside_index_to_candidates_) { + for (const auto& candidate : candidates) { + if (candidate->partition_spec_name() == kHostSpecName) { + continue; + } + lines.emplace_back(candidate->ToSummary(*dataflow_graph_)); + } + } + std::sort(lines.begin(), lines.end()); + std::ostringstream os; + bool first = true; + for (const auto& line : lines) { + if (first) { + first = false; + } else { + os << std::endl; + } + os << line; + } + return os.str(); +} + +bool CandidatePartitionIndex::IsCompatibleWithVirtualDevice(const CandidatePartition& candidate) { + for (PostDfsIndex index : candidate->sub_graph_->inside_) { + const ExprNode* sub_expr_node = dataflow_graph_->index_to_node(index)->node_ref_; + auto itr = virtual_devices_->find(sub_expr_node); + ICHECK(itr != virtual_devices_->end()); + const Target& existing_target = itr->second->target; + if (!existing_target.defined()) { + // No constraint. + continue; + } + if (StructuralEqual()(existing_target, candidate->target())) { + // No disagreement. 
+ continue; + } + if (!candidate->target().IsExternalCodegenFor(itr->second->target)) { + // The candidate's target is not an external codegen target compatible with the existing + // target. + // TODO(mbs): There's a conflict here between Collage's desire to leave some expression nodes + // 'behind' on the VM and PlanDevice's desire to assign a primitive Target to every node. + // I think PlanDevices is the one that needs to give here by leaving such nodes + // unconstrained. + VLOG(1) << "Ignoring candidate " << candidate->ToString() + << " since incompatible with existing virtual device assignment of:" << std::endl + << itr->second << std::endl + << "to sub-graph:" << std::endl + << PrettyPrint(GetRef(sub_expr_node)); + return false; + } + } + return true; +} + +std::vector CandidatePartitionIndex::Collect( + const Array& partition_specs) { + VLOG_CONTEXT << "collecting"; + std::vector result; + for (const auto& spec : partition_specs) { + VLOG_CONTEXT << "spec " << spec->spec_name_; + VLOG(1) << "collecting candidates"; + std::vector candidates = spec->AllCandidates(*dataflow_graph_); + for (auto& candidate : candidates) { + if (!IsCompatibleWithVirtualDevice(candidate)) { + continue; + } + result.push_back(candidate); + } + } + VLOG(1) << "Found " << result.size() << " candidates"; + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_partition_index.h b/src/relay/collage/candidate_partition_index.h new file mode 100644 index 0000000000000..cfb83de829967 --- /dev/null +++ b/src/relay/collage/candidate_partition_index.h @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/candidate_partition_index.h + * \brief Index for finding relevant candidate partitions for a particular search state. + */ +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ + +#include + +#include +#include +#include +#include + +#include "partition_spec.h" +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Collects and indexes all the candidate partitions for the overall expression. This index + * is used during partitioning search to find the next valid candidate partition to explore from the + * current search state. We do not yet attempt to estimate the cost of each candidate partition, and + * when we do so during the search we may discover it to be infeasible. + */ +class CandidatePartitionIndex { + public: + CandidatePartitionIndex(const std::unordered_map* virtual_devices, + DataflowGraph* dataflow_graph); + + /*! \brief Constructs the index. */ + void Index(const Array& partition_specs); + + /*! \brief Returns all the candidates which may begin at \p index. */ + const std::vector& candidates_at(PostDfsIndex index) const { + ICHECK_LT(index, dataflow_graph_->size()); + return first_inside_index_to_candidates_[index]; + } + + /*! \brief Estimates the costs of all candidates in the index. Each candidate caches its cost. 
*/ + void EstimateAllCosts(CostEstimator cost_estimator, const CompilationConfig& config, + std::shared_ptr cache); + + size_t size() const { return size_; } + + std::string ToSummary() const; + + private: + /*! + * \brief Returns true if \p candidate's desired target is compatible with any existing target + * constraints on the candidate's sub-expressions. + */ + bool IsCompatibleWithVirtualDevice(const CandidatePartition& candidate); + + /*! \brief Returns all valid candidates found from \p partition_specs. */ + std::vector Collect(const Array& partition_specs); + + /*! + * \brief The \p VirtualDevice for every sub-expression in the overall expression. Needed to + * ensure candidates do not contradict the target/device placement already determined by + * device planning. + */ + const std::unordered_map* virtual_devices_; + + /*! \brief Dataflow graph for overall expression. */ + DataflowGraph* dataflow_graph_; + + /*! + * \brief Maps post-dfs indexes to all the candidates which have that as their first inside + * index, and which should be considered in the Collage search. + */ + std::vector> first_inside_index_to_candidates_; + + /*! \brief Number of entries in above. */ + size_t size_ = 0; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_INDEX_H_ diff --git a/src/relay/collage/candidate_set.cc b/src/relay/collage/candidate_set.cc new file mode 100644 index 0000000000000..2c2a7eaf8d540 --- /dev/null +++ b/src/relay/collage/candidate_set.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.cc + * \brief Collects a set of candidate partitions. + */ + +#include "./candidate_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidateSet::CandidateSet(std::vector candidates_to_add) + : candidates_to_add_(std::move(candidates_to_add)) { + for (const auto& candidate : candidates_to_add_) { + seen_.emplace(candidate); + } +} + +void CandidateSet::Add(const DataflowGraph& dataflow_graph, + const CandidatePartition& new_candidate) { + VLOG(2) << "adding " << new_candidate->ToString(); + if (seen_.count(new_candidate)) { + VLOG(2) << "already seen candidate, ignoring"; + return; + } + seen_.emplace(new_candidate); + candidates_to_add_.emplace_back(new_candidate); +} + +void CandidateSet::Remove(const CandidatePartition& old_candidate) { + ICHECK(seen_.count(old_candidate)); + VLOG(2) << "removing " << old_candidate->ToString(); + candidates_to_remove_.emplace_back(old_candidate); +} + +bool CandidateSet::PrepareForNextRound() { + size_t init_size = current_candidates_.size(); + for (const auto& candidate_to_remove : candidates_to_remove_) { + current_candidates_.erase( + std::remove(current_candidates_.begin(), current_candidates_.end(), candidate_to_remove), + current_candidates_.end()); + } + size_t num_removed = init_size - current_candidates_.size(); + candidates_to_remove_.clear(); + first_new_index_ = current_candidates_.size(); + for (const auto& new_candidate : candidates_to_add_) { + current_candidates_.push_back(new_candidate); + } + size_t num_added = 
candidates_to_add_.size(); + candidates_to_add_.clear(); + VLOG(1) << "removed " << num_removed << " and added " << num_added << " candidates"; + return num_removed + num_added > 0; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_set.h b/src/relay/collage/candidate_set.h new file mode 100644 index 0000000000000..4cb2c40e9500e --- /dev/null +++ b/src/relay/collage/candidate_set.h @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.h + * \brief Collects a set of candidate partitions. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ + +#include +#include +#include +#include + +#include "./candidate_partition.h" +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Holds a vector of current candidates and the additions/removals to apply to them. + */ +struct CandidateSet { + CandidateSet() = default; + + explicit CandidateSet(std::vector candidates_to_add); + + /*! + * \brief Schedule \p new_candidate for addition before the next round (unless it is not valid). 
+ */ + void Add(const DataflowGraph& dataflow_graph, const CandidatePartition& new_candidate); + + /*! \brief Schedule \p old_candidate for removal before the next round. */ + void Remove(const CandidatePartition& old_candidate); + + /*! + * \brief Update \p current_candidates and \p first_new_index. Return false if no + * new candidates were added, in which case we have reached a fixed point. + */ + bool PrepareForNextRound(); + + size_t size() const { return current_candidates_.size(); } + + CandidatePartition operator[](size_t i) const { + ICHECK_LT(i, current_candidates_.size()); + return current_candidates_[i]; + } + CandidatePartition at(size_t i) const { return (*this)[i]; } + + size_t first_new_index() const { return first_new_index_; } + + void sort() { std::sort(current_candidates_.begin(), current_candidates_.end()); } + + std::vector MovedCurrentCandidates() { + return std::move(current_candidates_); + } + + private: + /*! + * \brief Index of first candidate in current_candidates added in last round. This can be used to + * avoid considering candidates or candidate combinations which have already been considered in an + * earlier round. + */ + size_t first_new_index_ = 0; + /*! \brief Candidates gathered in previous rounds. */ + std::vector current_candidates_; + /*! \brief New candidates gathered in the current round. */ + std::vector candidates_to_add_; + /*! \brief Existing candidates to remove before starting the next round. */ + std::vector candidates_to_remove_; + /*! \brief Which candidates have been seen so far and should not be added again. 
*/ + std::unordered_set seen_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ diff --git a/src/relay/collage/collage_partitioner.cc b/src/relay/collage/collage_partitioner.cc new file mode 100644 index 0000000000000..52abd7c08c45b --- /dev/null +++ b/src/relay/collage/collage_partitioner.cc @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/collage_partitioner.cc + * \brief Search for an optimal partitioning of a Relay model. 
+ */ + +#include "./collage_partitioner.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../transforms/compiler_function_utils.h" +#include "./candidate_partition.h" +#include "./candidate_partition_index.h" +#include "./cost.h" +#include "./cost_estimator.h" +#include "./gather_partition_specs.h" +#include "./name_supply.h" +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./priority_queue.h" +#include "./recover_virtual_device_map.h" +#include "./sub_graph.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { +namespace { + +TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.tvm_max_max_depth", Integer); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.byoc_max_max_depth", Integer); + +/*! + * \brief Represents the overall expression after some number of non-overlapping candidate + * partitions have been applied. + */ +class SearchState { + public: + explicit SearchState(IndexSet covered) : covered_(std::move(covered)) {} + + /*! + * \brief Order states by increasing best cost, breaking ties by lexicographic order on + * the covering sub graph. + */ + bool operator<(const SearchState& that) const { + return std::tie(best_cost_, covered_) < std::tie(that.best_cost_, that.covered_); + } + + const IndexSet& covered() const { return covered_; } + + std::string ToString() const { + std::ostringstream os; + os << "State("; + os << "covered=" << covered_.ToString(); + os << ",best_cost=" << best_cost_.ToString(); + if (best_candidate_.defined()) { + os << ",best_candidate=" << best_candidate_->ToString(); + } + os << ")"; + return os.str(); + } + + private: + /*! \brief Which nodes of overall expression have been placed on all paths to this state. */ + IndexSet covered_; + /*! \brief Predecessor state for sequence of candidates reaching this state with least + * cost. Null if initial search state. 
*/ + SearchState* pred_state_ = nullptr; + /*! + * \brief Cost of reaching this state using placement implied by path given by pred_state fields. + * Includes estimated/measured cost of all candidates plus any candidate launch penalty. + * Initially invalid cost. + */ + Cost best_cost_ = Cost::Invalid(); + /*! \brief Candidate partition selected in transition from pred_state to this state. */ + CandidatePartition best_candidate_; + + friend class Partitioner; +}; + +struct CompareSearchStatePtrs { + bool operator()(const SearchState* left, const SearchState* right) const { + return *left < *right; + } +}; + +struct EqualSearchStatePtrs { + bool operator()(const SearchState* left, const SearchState* right) const { + return left->covered() == right->covered(); + } +}; + +/*! + * \brief Finds the optimal partitioning of an expression to candidate partitions. + * Though no candidate partitions overlap, it is possible some sub-expressions end up in + * no candidate. Those sub-expressions must be evaluated by the host executor (eg VM). + */ +class Partitioner { + public: + explicit Partitioner(CompilationConfig config, Array partition_specs, + const std::unordered_map* virtual_devices, + CostEstimator cost_estimator, std::shared_ptr cache) + : config_(std::move(config)), + partition_specs_(std::move(partition_specs)), + virtual_devices_(virtual_devices), + cost_estimator_(std::move(cost_estimator)), + cache_(std::move(cache)) {} + + Expr Partition(const Expr& expr) { + // Establish core data structures. + dataflow_graph_ = std::make_unique(expr); + VLOG(1) << "Created dataflow graph with " << dataflow_graph_->size() << " nodes"; + + // Build the candidate index. This is where all the partition rules are invoked . + index_ = std::make_unique(virtual_devices_, dataflow_graph_.get()); + index_->Index(partition_specs_); + VLOG(1) << "All candidates before search:" << std::endl << index_->ToSummary(); + + // 'Eagerly' estimate the cost of all candidates. 
+ // + // Note if this is not done costs will simply be estimated 'lazily' as the search proceeds. + // Typically, some candidates are never explored during the search because: + // - There are no paths in which the candidate does not intersect candidates already + // applied on the path. + // - The Dijkstra search terminates early with a least cost path. + // So eager may result in more estimation overhead. However, eager could be made + // embarrassingly parallel. + VLOG(1) << "Beginning eager cost estimation"; + index_->EstimateAllCosts(cost_estimator_, config_, cache_); + VLOG(1) << "Finished eager cost estimation"; + + // Setup initial state. + SearchState* init_state = GetState(IndexSet(dataflow_graph_->size())); + init_state->best_cost_ = Cost::Zero(); + pq_.Push(init_state); + + size_t num_transitions = 0; + + VLOG(1) << "#### Commencing Collage search over " << index_->size() << " candidates ####"; + while (!pq_.empty()) { + SearchState* curr_state = pq_.Pop(); + VLOG(1) << "Looking at state " << curr_state->covered_.ToString(); + PostDfsIndex next_index = curr_state->covered_.FirstOutsideIndex(); + + if (next_index >= dataflow_graph_->size()) { + // The entire expression has been explored. Collect the candidates on the optimal path. 
+ VLOG(1) << "#### Finished Collage search after exploring " << num_transitions + << " transitions ####"; + std::vector best_candidates; + while (curr_state != init_state) { + ICHECK(curr_state->best_candidate_.defined()); + best_candidates.emplace_back(curr_state->best_candidate_); + curr_state = curr_state->pred_state_; + ICHECK(curr_state != nullptr); + } + return Finalize(expr, best_candidates); + } + + size_t num_fires = 0; + Expr sub_expr = dataflow_graph_->index_to_node(next_index)->ref(); + VLOG(1) << "Looking at index " << next_index << " for sub-expression " + << SubExprKindAndLabel(sub_expr).second << " out of " << dataflow_graph_->size() + << " total dataflow nodes"; + + // Explore all the outgoing candidates from the current state. + for (const auto& candidate : index_->candidates_at(next_index)) { + VLOG(1) << "Considering candidate " << candidate->ToSummary(*dataflow_graph_) + << " for transition " << ++num_transitions << " over " << index_->size() + << " total candidates"; + if (!candidate->sub_graph_->inside_.AreDisjoint(curr_state->covered_)) { + LOG(INFO) << "Candidate overlaps with already partitioned nodes"; + continue; + } + IndexSet next_covered = curr_state->covered_ | candidate->sub_graph_->inside_; + SearchState* next_state = GetState(next_covered); + Relax(curr_state, next_state, candidate); + ++num_fires; + } + ICHECK_GT(num_fires, 0) + << "No candidate was found covering sub-expression at index " << next_index + << ", suggesting the partition rules are incomplete for the given targets."; + } + ICHECK(false) << "should have reached end state in which all sub-expressions are covered"; + return {}; + } + + /*! \brief Returns the unique state corresponding to the \p covered sub-graph. 
*/ + SearchState* GetState(const IndexSet& covered) { + auto itr = covered_to_state_.find(covered); + if (itr != covered_to_state_.end()) { + return itr->second.get(); + } + auto state = std::make_unique(covered); + SearchState* raw_ptr = state.get(); + covered_to_state_.emplace(covered, std::move(state)); + return raw_ptr; + } + + /*! + * \brief Record that it is possible to reach \p next_state by choosing \p candidate + * in \p curr_state. If the resulting cost is better than the best known so far, update + * \p next_state's best cost, predecessor and candidate to match. + */ + void Relax(SearchState* curr_state, SearchState* next_state, + const CandidatePartition& candidate) { + // Note this may already be cached if the candidate partition costs were 'eagerly' estimated. + Cost candidate_cost = + candidate->EstimatedCost(*dataflow_graph_, cost_estimator_, config_, cache_); + VLOG(1) << "Candidate has cost " << candidate_cost.ToString(); + Cost new_state_cost = candidate_cost + curr_state->best_cost_; + const bool is_new = next_state->best_cost_.is_invalid(); + CandidatePartition previously_best_candidate = next_state->best_candidate_; + if (is_new || new_state_cost < next_state->best_cost_) { + next_state->pred_state_ = curr_state; + Cost previously_best_cost = next_state->best_cost_; + next_state->best_cost_ = new_state_cost; + next_state->best_candidate_ = candidate; + if (is_new) { + VLOG(1) << "transition " << curr_state->ToString() << " --> " << next_state->ToString() + << " (New state for spec " << candidate->partition_spec_name() << ")"; + pq_.Push(next_state); + } else { + VLOG(1) << "transition " << curr_state->ToString() << " --> " << next_state->ToString() + << " (Spec " << candidate->partition_spec_name() << " beats previous spec " + << previously_best_candidate->partition_spec_name() << " by " + << (previously_best_cost - curr_state->best_cost_).ToString() << ")"; + pq_.Update(next_state); + } + } else { + VLOG(1) << "transition " << 
curr_state->ToString() << " --> " << next_state->ToString() + << " (Spec " << candidate->partition_spec_name() << " does not beat existing spec " + << previously_best_candidate->partition_spec_name() << ")"; + } + } + + /*! + * \brief Returns the result of partitioning \p expr according to 'optimal' candidates found + * by the search. + */ + Expr Finalize(const Expr& expr, std::vector best_candidates) { + best_candidates = CandidatePartition::MaxCoalesce(*dataflow_graph_, best_candidates); + + Cost total_cost = Cost::Zero(); + std::ostringstream os; + os << "Optimal partitioning:" << std::endl; + for (const auto& best_candidate : best_candidates) { + if (best_candidate->partition_spec_name() == kHostSpecName) { + continue; + } + os << best_candidate->ToSummary(*dataflow_graph_); + os << std::endl; + total_cost = total_cost + best_candidate->cost_; + } + os << "Estimated overall cost is " << total_cost.ToString(); + LOG(INFO) << os.str(); + + LOG(INFO) << "All candidates after search:" << std::endl << index_->ToSummary(); + + return CandidatePartition::ParallelRewrite(*dataflow_graph_, expr, best_candidates); + } + + private: + /*! \brief Available targets, including both 'regular' and 'external codegen'. */ + CompilationConfig config_; + /*! \brief Available partition specs to use during search. */ + Array partition_specs_; + /*! + * \brief The virtual devices for every sub-expression so we can respect any existing target + * constraints. + */ + const std::unordered_map* virtual_devices_; + /*! \brief Cost estimator to use for candidates. */ + CostEstimator cost_estimator_; + /*! \brief Cached names and costs for all partition functions. */ + std::shared_ptr cache_; + /*! \brief Dataflow graph for overall expression. */ + std::unique_ptr dataflow_graph_; + /*! \brief Index of all avoilable candidates we are searching over. */ + std::unique_ptr index_; + /*! \brief Map from covered sub-graphs to the corresponding state. 
*/ + std::unordered_map, IndexSetHash, IndexSetEqual> + covered_to_state_; + /*! \brief Priority queue of states, ordered by increasing cost. */ + PriorityQueue pq_; +}; + +} // namespace + +transform::Pass CollagePartition(CompilationConfig config, CostEstimator cost_estimator) { + runtime::TypedPackedFunc pass_func = + [config = std::move(config), cost_estimator = std::move(cost_estimator)]( + IRModule mod, transform::PassContext ctxt) { + VLOG(1) << "CollagePartition input:" << std::endl << PrettyPrint(mod); + + Array partition_specs = GatherPartitionSpecs(config); + VLOG(1) << "Gathered " << partition_specs.size() << " partition specs"; + + auto cache = + std::make_shared(std::make_shared("collage")); + + IRModule out_mod = mod->ShallowCopy(); + for (const auto& kv : mod->functions) { + if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) { + auto function = GetRef(function_node); + std::unordered_map virtual_devices = + RecoverVirtualDeviceMap(mod, function); + Partitioner partitioner(config, partition_specs, &virtual_devices, cost_estimator, + cache); + Function result = Downcast(partitioner.Partition(function)); + out_mod->Add(kv.first, result); + } + } + + out_mod = OutlineCompilerFunctions(cache)(std::move(out_mod)); + VLOG(1) << "CollagePartition result:" << std::endl << PrettyPrint(out_mod); + return out_mod; + }; + return tvm::transform::CreateModulePass(pass_func, /*opt_level=*/0, "CollagePartition", {}); +} + +TVM_REGISTER_GLOBAL("relay._transform.CollagePartition").set_body_typed(CollagePartition); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/collage_partitioner.h b/src/relay/collage/collage_partitioner.h new file mode 100644 index 0000000000000..7c8de87ffe0a3 --- /dev/null +++ b/src/relay/collage/collage_partitioner.h @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/collage_partitioner.h + * \brief Search for an optimal partitioning of a Relay model. + * + * See: + * Collage: Automated Integration of Deep Learning Backends + * Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia + * https://arxiv.org/pdf/2111.00655.pdf + */ +#ifndef TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ +#define TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ + +#include + +#include "./cost_estimator.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Explores the space of all possible (sub-graph, target) pairs which cover the + * model, and applies the globally optimal choice (assuming partition costs are additive). + */ +transform::Pass CollagePartition(CompilationConfig config, CostEstimator cost_estimator); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COLLAGE_PARTITIONER_H_ diff --git a/src/relay/collage/combiner_rule.cc b/src/relay/collage/combiner_rule.cc new file mode 100644 index 0000000000000..bf6e0eec1cf42 --- /dev/null +++ b/src/relay/collage/combiner_rule.cc @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/combiner_rule.cc + * \brief Helpers for the \p CombinePartitionRule + */ + +#include "./combiner_rule.h" + +#include "./partition_spec.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(SimpleCombinerRuleNode); + +void SimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +bool SimpleCombinerRuleNode::Fires(const DataflowGraph& dataflow_graph, + const CandidatePartition& upstream, + const CandidatePartition& downstream) const { + return false; +} + +std::string SimpleCombinerRuleNode::ToString() const { + return "SimpleCombinerRule(" + rule_name_ + ")"; +} + +SimpleCombinerRule::SimpleCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(ByKindSimpleCombinerRuleNode); + +void ByKindSimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +bool ByKindSimpleCombinerRuleNode::Fires(const DataflowGraph& dataflow_graph, + const CandidatePartition& upstream, + const CandidatePartition& downstream) const { + return upstream->sub_graph_->kind_ <= upstream_kind_ && + downstream->sub_graph_->kind_ <= downstream_kind_; +} + +std::string 
ByKindSimpleCombinerRuleNode::ToString() const { + std::ostringstream os; + os << "ByKindSimpleCombinerRule(" << rule_name_ << ")"; + return os.str(); +} + +ByKindSimpleCombinerRule::ByKindSimpleCombinerRule(OpPatternKind upstream_kind, + OpPatternKind downstream_kind) { + auto node = runtime::make_object(); + String rule_name = KindToString(upstream_kind) + "->" + KindToString(downstream_kind); + node->rule_name_ = std::move(rule_name); + node->upstream_kind_ = upstream_kind; + node->downstream_kind_ = downstream_kind; + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CombinerRuleNode); + +void CombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void CombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const {} + +std::string CombinerRuleNode::ToString() const { return "CombinerRuleNode(" + rule_name_ + ")"; } + +CombinerRule::CombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(AllSimpleCombinerRuleNode); + +void AllSimpleCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void AllSimpleCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running AllSimpleCombinerRule(" << rule_name_ << ")"; + // Build map from post-dfs indices to the indices of candidates with corresponding entry node. + // NOTE: the index set is over candidate indices not post-dfs indices! + std::vector entry_map(ctxt->dataflow_graph->size(), + IndexSet(ctxt->candidate_set->size())); + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition candidate = ctxt->candidate_set->at(i); + for (PostDfsIndex entry_index : candidate->sub_graph_->entry_) { + entry_map[entry_index].Add(i); + } + } + + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition upstream = ctxt->candidate_set->at(i); + // Narrow our search to just those candidates which could touch. 
+ IndexSet possible_downstream(ctxt->candidate_set->size()); + for (PostDfsIndex output_index : upstream->sub_graph_->output_) { + possible_downstream = possible_downstream | entry_map[output_index]; + } + size_t start_j = + i < ctxt->candidate_set->first_new_index() ? ctxt->candidate_set->first_new_index() : 0; + for (size_t j : possible_downstream) { + if (i == j) { + continue; + } + if (i < start_j) { + // We already explored the cross-product of candidates [0, first_new_index), so don't + // do it again. + continue; + } + // Note that the rules are not commutative so we can't just ignore if j < i. + CandidatePartition downstream = ctxt->candidate_set->at(j); + if (ctxt->max_max_depth > 0 && + upstream->sub_graph_->max_depth_ + downstream->sub_graph_->max_depth_ > + ctxt->max_max_depth) { + continue; + } + if (!upstream.AreTouching(*ctxt->dataflow_graph, downstream)) { + continue; + } + for (const auto& simple_rule : simple_rules_) { + if (simple_rule->Fires(*ctxt->dataflow_graph, upstream, downstream)) { + CandidatePartition new_candidate = + upstream.DisjointUnion(*ctxt->dataflow_graph, downstream); + VLOG(2) << "Fired " << simple_rule->rule_name_ << " on upstream candidate " + << upstream->ToString() << " and downstream candidate " << downstream->ToString() + << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } + } +} + +std::string AllSimpleCombinerRuleNode::ToString() const { + std::ostringstream os; + os << "AllSimpleCombinerRule(" << rule_name_; + for (const auto& simple : simple_rules_) { + os << ", " << simple->ToString(); + } + os << ")"; + return os.str(); +} + +AllSimpleCombinerRule::AllSimpleCombinerRule(String rule_name, + Array simple_rules) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->simple_rules_ = std::move(simple_rules); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(TupleArgCombinerRuleNode); + +void 
TupleArgCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void TupleArgCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running TupleArgCombinerRule(" << rule_name_ << ")"; + // Build map from post-dfs index to the indices of injective candidates with corresponding entry + // node. NOTE: the index set is over candidate indices not post-dfs indices! + std::vector exit_map(ctxt->dataflow_graph->size(), + IndexSet(ctxt->candidate_set->size())); + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition candidate = ctxt->candidate_set->at(i); + if (candidate->sub_graph_->kind_ > kInjective) { + continue; + } + for (PostDfsIndex exit_index : candidate->sub_graph_->exit_) { + exit_map[exit_index].Add(i); + } + } + + // The two-step I -> tuple -> I rule. + // Look all possible tuple consumers... + for (size_t i = 0; i < ctxt->candidate_set->size(); ++i) { + CandidatePartition tuple_consumer_candidate = ctxt->candidate_set->at(i); + if (tuple_consumer_candidate->sub_graph_->kind_ > kInjective) { + continue; + } + // For all possible tuples feeding into candidate... + for (PostDfsIndex input_index : tuple_consumer_candidate->sub_graph_->input_) { + auto node = ctxt->dataflow_graph->index_to_node(input_index); + Expr sub_expr = node->ref(); + const auto* tuple_node = sub_expr.as(); + if (tuple_node == nullptr) { + continue; + } + // The tuple_consumer_candidate candidate consumes (at least one) tuple, eg as an argument + // to an operator. + // eg: concatenate((field1, ..., fieldn)) + auto tuple_dataflow_node = ctxt->dataflow_graph->item_to_node(tuple_node); + + // Collect all the possible unions. There may be more than one if different candidates + // could supply the same tuple field. + std::vector> all_possible_unions; + + // Obviously we must include the consumer. 
+ all_possible_unions.emplace_back(); + all_possible_unions.back().emplace_back(tuple_consumer_candidate); + + // We must include the tuple itself. + SubGraph tuple_sub_graph(*ctxt->dataflow_graph, + IndexSet(ctxt->dataflow_graph->size(), {node->index_}), kInjective, + "tuple"); + CandidatePartition tuple_candidate("", std::move(tuple_sub_graph), + tuple_consumer_candidate->partition_spec()); + all_possible_unions.back().emplace_back(std::move(tuple_candidate)); + + // For all tuple fields... + bool all_tuple_fields_have_producer = true; + for (auto* tuple_field_dataflow_node : tuple_dataflow_node->inputs_) { + // Collect all the candidates which could produce this tuple field. + std::vector to_appends; + size_t start_j = + i < ctxt->candidate_set->first_new_index() ? ctxt->candidate_set->first_new_index() : 0; + for (size_t j : exit_map[tuple_field_dataflow_node->index_]) { + if (i == j) { + continue; + } + if (i < start_j) { + // We already explored the cross-product of candidates [0, first_new_index), so don't + // do it again. + continue; + } + CandidatePartition tuple_field_producer = ctxt->candidate_set->at(j); + // The tuple_field_producer candidate can provide this tuple field. + // eg concatenate((..., producer, ...)) + to_appends.emplace_back(tuple_field_producer); + } + if (to_appends.empty()) { + // At least one of the tuple's fields does not have a producer candidate we can + // union in, so we need to give up. + all_tuple_fields_have_producer = false; + break; + } else { + // If to_appends = [A, B] and we already have possible unions [C, D] and [E, F] then + // the new possible unions are [C, D, A], [C, D, B], [E, F, A] and [E, F, B]. 
+ std::vector> new_all_possible_unions; + for (const auto& to_append : to_appends) { + for (const auto& possible_union : all_possible_unions) { + new_all_possible_unions.emplace_back(possible_union); + new_all_possible_unions.back().emplace_back(to_append); + } + } + all_possible_unions = std::move(new_all_possible_unions); + } + } + + if (!all_tuple_fields_have_producer) { + continue; + } + + // Actually build the candidates which union according to all_possible_unions. + for (const auto& possible_union : all_possible_unions) { + if (possible_union.size() > 2) { + CandidatePartition new_candidate = + CandidatePartition::DisjointUnion(*ctxt->dataflow_graph, possible_union); +#if TVM_LOG_DEBUG + std::ostringstream os; + bool first = true; + for (const auto& candidate : possible_union) { + if (first) { + first = false; + } else { + os << ", "; + } + os << candidate->ToString(); + } + VLOG(2) << "Fired rule " << rule_name_ << " on {" << os.str() << "} to yield " + << new_candidate->ToString(); +#endif + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } + } +} + +std::string TupleArgCombinerRuleNode::ToString() const { + return "TupleArgCombinerRule(" + rule_name_ + ")"; +} + +TupleArgCombinerRule::TupleArgCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(TupleProjCombinerRuleNode); + +void TupleProjCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void TupleProjCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running TupleProjCombinerRule(" << rule_name_ << ")"; + // We already explored [0, first_new_index), so don't do it again. 
+ for (size_t i = ctxt->candidate_set->first_new_index(); i < ctxt->candidate_set->size(); ++i) { + CandidatePartition base = ctxt->candidate_set->at(i); + for (PostDfsIndex index : base->sub_graph_->output_) { + auto node = ctxt->dataflow_graph->index_to_node(index); + if (node->ref().as()) { + IndexSet index_set(ctxt->dataflow_graph->size(), {node->index_}); + SubGraph sub_graph(*ctxt->dataflow_graph, std::move(index_set), kInjective, "proj"); + CandidatePartition proj_candidate("", std::move(sub_graph), base->spec_); + CandidatePartition new_candidate = + base.DisjointUnion(*ctxt->dataflow_graph, proj_candidate); + VLOG(2) << "Fired rule " << rule_name_ << " on " << proj_candidate->ToString() << " and " + << base->ToString() << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } + } +} + +std::string TupleProjCombinerRuleNode::ToString() const { + return "TupleProjCombinerRule(" + rule_name_ + ")"; +} + +TupleProjCombinerRule::TupleProjCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(ConstantCombinerRuleNode); + +void ConstantCombinerRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +void ConstantCombinerRuleNode::AppendAllResults(AppendAllResultsContext* ctxt) const { + VLOG(1) << "running ConstantCombinerRule(" << rule_name_ << ")"; + // We already explored [0, first_new_index), so don't do it again. 
+ for (size_t i = ctxt->candidate_set->first_new_index(); i < ctxt->candidate_set->size(); ++i) { + CandidatePartition base = ctxt->candidate_set->at(i); + IndexSet new_constants(ctxt->dataflow_graph->size()); + for (PostDfsIndex index : base->sub_graph_->input_) { + auto node = ctxt->dataflow_graph->index_to_node(index); + if (node->ref().as()) { + new_constants.Add(index); + } + } + if (!new_constants.IsZero()) { + SubGraph sub_graph(*ctxt->dataflow_graph, new_constants, kElemWise, "const"); + CandidatePartition new_const_candidate("", std::move(sub_graph), base->spec_); + CandidatePartition new_candidate = + base.DisjointUnion(*ctxt->dataflow_graph, new_const_candidate); + VLOG(2) << "Fired rule " << rule_name_ << " on " << new_const_candidate->ToString() << " and " + << base->ToString() << " to yield " << new_candidate->ToString(); + ctxt->candidate_set->Add(*ctxt->dataflow_graph, new_candidate); + } + } +} + +std::string ConstantCombinerRuleNode::ToString() const { + return "ConstantCombinerRule(" + rule_name_ + ")"; +} + +ConstantCombinerRule::ConstantCombinerRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/combiner_rule.h b/src/relay/collage/combiner_rule.h new file mode 100644 index 0000000000000..bbaa9486d9297 --- /dev/null +++ b/src/relay/collage/combiner_rule.h @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/combiner_rule.h + * \brief Helpers for the \p CombinePartitionRule + */ + +#ifndef TVM_RELAY_COLLAGE_COMBINER_RULE_H_ +#define TVM_RELAY_COLLAGE_COMBINER_RULE_H_ + +#include +#include + +#include + +#include "./candidate_partition.h" +#include "./candidate_set.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Base class for all 'simple' combiner rules. + * + * Given \p upstream and \p downstream candidates which touch, a simple combiner rule returns + * true if their union should also be considered a candidate. + */ +class SimpleCombinerRuleNode : public Object { + public: + String rule_name_; + + void VisitAttrs(AttrVisitor* v); + + virtual bool Fires(const DataflowGraph& dataflow_graph, const CandidatePartition& upstream, + const CandidatePartition& downstream) const; + + virtual std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.SimpleCombinerRule"; + static constexpr const uint32_t _type_child_slots = 1; + TVM_DECLARE_BASE_OBJECT_INFO(SimpleCombinerRuleNode, Object); +}; + +class SimpleCombinerRule : public ObjectRef { + public: + explicit SimpleCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(SimpleCombinerRule, ObjectRef, SimpleCombinerRuleNode); +}; + +/*! + * \brief A simple combiner rule which fires if the \p upstream and \p downstream candidates have + * the given \p upstream_kind and \p downstream_kind (or less) respectively. 
+ */ +class ByKindSimpleCombinerRuleNode : public SimpleCombinerRuleNode { + public: + OpPatternKind upstream_kind_; + OpPatternKind downstream_kind_; + + void VisitAttrs(AttrVisitor* v); + + bool Fires(const DataflowGraph& dataflow_graph, const CandidatePartition& upstream, + const CandidatePartition& downstream) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.ByKindSimpleCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(ByKindSimpleCombinerRuleNode, SimpleCombinerRuleNode); +}; + +class ByKindSimpleCombinerRule : public SimpleCombinerRule { + public: + ByKindSimpleCombinerRule(OpPatternKind upstream_kind, OpPatternKind downstream_kind); + + TVM_DEFINE_OBJECT_REF_METHODS(ByKindSimpleCombinerRule, SimpleCombinerRule, + ByKindSimpleCombinerRuleNode); +}; + +/*! \brief Context required by CombineRuleNode::AppendAllResultsContext. */ +struct AppendAllResultsContext { + AppendAllResultsContext(const DataflowGraph* dataflow_graph, size_t max_max_depth, + CandidateSet* candidate_set) + : dataflow_graph(dataflow_graph), + max_max_depth(max_max_depth), + candidate_set(candidate_set) {} + + const DataflowGraph* dataflow_graph; + size_t max_max_depth; + CandidateSet* candidate_set; +}; + +/*! + * \brief Base class for all 'combiner' rules. + * + * Given the current candidate set, a combiner rule looks for opportunities to form larger + * candidates, optionally removing existing candidates in the process. 
+ */ +class CombinerRuleNode : public Object { + public: + String rule_name_; + + void VisitAttrs(AttrVisitor* v); + + virtual void AppendAllResults(AppendAllResultsContext* ctxt) const; + virtual std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.CombinerRule"; + static constexpr const uint32_t _type_child_slots = 4; + TVM_DECLARE_BASE_OBJECT_INFO(CombinerRuleNode, Object); +}; + +class CombinerRule : public ObjectRef { + public: + explicit CombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(CombinerRule, ObjectRef, CombinerRuleNode); +}; + +/*! + * \brief A combiner rule which runs one or more simple combiner rules over the current + * touching candidates. + */ +class AllSimpleCombinerRuleNode : public CombinerRuleNode { + public: + Array simple_rules_; + + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.AllSimpleCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(AllSimpleCombinerRuleNode, CombinerRuleNode); +}; + +class AllSimpleCombinerRule : public CombinerRule { + public: + AllSimpleCombinerRule(String rule_name, Array simple_rules); + + TVM_DEFINE_OBJECT_REF_METHODS(AllSimpleCombinerRule, CombinerRule, AllSimpleCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines injective sub-groups which appear inside tuples which are + * themselves inputs to injective sub-groups. 
+ */ +class TupleArgCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.TupleArgCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(TupleArgCombinerRuleNode, CombinerRuleNode); +}; + +class TupleArgCombinerRule : public CombinerRule { + public: + explicit TupleArgCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(TupleArgCombinerRule, CombinerRule, TupleArgCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines tuple projection if it's an output of an injective + * group. + */ +class TupleProjCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.TupleProjCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(TupleProjCombinerRuleNode, CombinerRuleNode); +}; + +class TupleProjCombinerRule : public CombinerRule { + public: + explicit TupleProjCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(TupleProjCombinerRule, CombinerRule, TupleProjCombinerRuleNode); +}; + +/*! + * \brief A combiner rule which combines constants in argument positions to existing candidates. + * Note that scalars are always inlined, so this rule only combines tensor constant arguments. 
+ */ +class ConstantCombinerRuleNode : public CombinerRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + void AppendAllResults(AppendAllResultsContext* ctxt) const override; + std::string ToString() const override; + + static constexpr const char* _type_key = "relay.collage.ConstantCombinerRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(ConstantCombinerRuleNode, CombinerRuleNode); +}; + +class ConstantCombinerRule : public CombinerRule { + public: + explicit ConstantCombinerRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(ConstantCombinerRule, CombinerRule, ConstantCombinerRuleNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COMBINER_RULE_H_ diff --git a/src/relay/collage/cost.cc b/src/relay/collage/cost.cc new file mode 100644 index 0000000000000..ae2eb8600ebd0 --- /dev/null +++ b/src/relay/collage/cost.cc @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.cc + * \brief Represents the estimated cost of a candidate partition. 
+ */ + +#include "./cost.h" + +namespace tvm { +namespace relay { +namespace collage { + +std::string Cost::ToString() const { + if (is_invalid()) { + return "invalid"; + } else if (is_unknown()) { + return "unknown"; + } else if (value_ == 0.0) { + return "0"; + } else { + return std::to_string(value_ * 1e6) + "us"; + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/cost.h b/src/relay/collage/cost.h new file mode 100644 index 0000000000000..8ae276d22078f --- /dev/null +++ b/src/relay/collage/cost.h @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.h + * \brief Represents the estimated cost of a candidate partition. + */ +#ifndef TVM_RELAY_COLLAGE_COST_H_ +#define TVM_RELAY_COLLAGE_COST_H_ + +#include + +#include +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief The assumed cost for a candidate partition. Generally average execution time in seconds. + * However other cost functions are possible, for example to introduce a penalty for high memory + * use, etc. 
+ */ +class Cost { + public: + Cost() = delete; + + static Cost Zero() { return Cost(0.0); } + + /*! + * \brief Returns the distinguished 'invalid' cost signaling a candidate partition is not + * supported by the intended target, for example because the sub-graph has an unsupported operator + * or the intermediate memory required exceeds some system limit. + */ + static Cost Invalid() { return Cost(std::numeric_limits::infinity()); } + + bool is_invalid() const { return std::isinf(value_) && value_ > 0.0; } + + /*! + * \brief Returns the distinguished 'unknown' cost, signaling fixed priorities should be used to + * choose the best partitions. This can be used to disable tuning and fallback to fixed rules, + * much as TVM will use an un-tuned kernel if no tuning records are available. + */ + static Cost Unknown() { return Cost(std::numeric_limits::quiet_NaN()); } + + bool is_unknown() const { return std::isnan(value_); } + + /*! \brief Returns cost with given finite, non-negative value. */ + static Cost Value(double value) { + ICHECK(!std::isnan(value) && !std::isinf(value) && value >= 0.0); + return Cost(value); + } + + bool is_value() const { return !std::isnan(value_) && !std::isinf(value_); } + + /*! \brief Return true if the less-than relation is defined for this and that. */ + bool are_comparable(Cost that) const { return !std::isnan(value_) && !std::isnan(that.value_); } + + /*! \brief Returns sum of this and that. */ + Cost operator+(Cost that) const { return Cost(value_ + that.value_); } + + /*! \brief Returns difference of this and that. */ + Cost operator-(Cost that) const { return Cost(value_ - that.value_); } + + /*! \brief Returns true if this is cheaper than that, assuming they are comparable. */ + bool operator<(Cost that) const { return value_ < that.value_; } + + std::string ToString() const; + + private: + explicit Cost(double value) : value_(value) {} + + /*! + * \brief Non-negative value or: + * - +inf if candidate partition is not feasible. 
+ * - NaN if candidate partition has an unknown cost (priority may be used to break ties). + */ + double value_ = 0.0; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COST_H_ diff --git a/src/relay/collage/cost_estimator.cc b/src/relay/collage/cost_estimator.cc new file mode 100644 index 0000000000000..94a3062f9dc60 --- /dev/null +++ b/src/relay/collage/cost_estimator.cc @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost_estimator.cc + * \brief Interface for measuring candidate partition cost. 
+ */ + +#include "./cost_estimator.h" + +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_OBJECT_TYPE(CostEstimatorNode); +TVM_REGISTER_OBJECT_TYPE(MockEstimatorNode); + +CostEstimator::CostEstimator() { + auto node = make_object(); + data_ = std::move(node); +} + +Cost CostEstimatorNode::Estimate(const IRModule& mod, const Target& target, + bool needs_tvm_tuning) const { + static const runtime::PackedFunc* estimate_seconds = + runtime::Registry::Get("tvm.relay.collage.estimate_seconds"); + ICHECK(estimate_seconds); + const double value = (*estimate_seconds)(mod, target, needs_tvm_tuning); + if (std::isinf(value)) { + return Cost::Invalid(); + } else if (std::isnan(value)) { + return Cost::Unknown(); + } else { + return Cost::Value(value); + } +} + +class MockEstimationVisitor : private ExprVisitor { + public: + MockEstimationVisitor(double op_cost, double fusion_benefit) + : op_cost_(op_cost), fusion_benefit_(fusion_benefit) {} + + double EstimateCost(const Expr& body) { + this->VisitExpr(body); + return cost_; + } + + private: + double op_cost_; + double fusion_benefit_; + int ops_ = 0; + double cost_ = 0.0; + + void VisitExpr_(const CallNode* call) final { + if (call->op->IsInstance()) { + cost_ += op_cost_ * pow(fusion_benefit_, ops_); + ops_++; + } + ExprVisitor::VisitExpr_(call); + } +}; + +Cost MockEstimatorNode::Estimate(const IRModule& mod, const Target& target, + bool needs_tvm_tuning) const { + double op_cost = static_cast(target_costs_.at(target->kind->name)); + double cost = 0; + for (const auto& gv : mod->GetGlobalVars()) { + cost += MockEstimationVisitor(op_cost, /*fusion_benefit=*/0.9).EstimateCost(mod->Lookup(gv)); + } + return Cost::Value(cost); +} + +MockEstimator::MockEstimator(Map target_costs) { + auto node = make_object(); + node->target_costs_ = std::move(target_costs); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("collage.CostEstimator").set_body_typed([]() { return CostEstimator(); }); 
+ +TVM_REGISTER_GLOBAL("collage.MockEstimator").set_body_typed([](Map target_costs) { + return MockEstimator(target_costs); +}); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/cost_estimator.h b/src/relay/collage/cost_estimator.h new file mode 100644 index 0000000000000..145845b4a5587 --- /dev/null +++ b/src/relay/collage/cost_estimator.h @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost_estimator.cc + * \brief Interface for measuring candidate partition cost. + */ + +#ifndef TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ +#define TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ + +#include + +#include "./cost.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief An (abstract) estimator for the cost of executing "main" in an \p IRModule representing + * a candidate partition, using the given target for lowering and codegen. 
+ * + * Generally the implementation will compile to a \p runtime::Module (possibly on a target-specific + * worker if cross-compilation is not available), repeatedly invoke "main" with random data until + * measure variance is acceptable (on a target-specific worker), and return the summarized costs. + * + * If using a TVM native \p Target, it is possible compilation will itself invoke TVM tuning. + * + * TODO(mbs): Actually, currently not abstract so can get some local measurements. + */ +class CostEstimatorNode : public Object { + public: + /*! + * \brief Returns the estimated cost (possibly after many many minutes of training time) of + * running "main" in \p mod using \p target, which represents a possible partitioning of + * some overall Relay expression. + */ + virtual Cost Estimate(const IRModule& mod, const Target& target, bool needs_tvm_tuning) const; + + static constexpr const char* _type_key = "collage.CostEstimator"; + TVM_DECLARE_BASE_OBJECT_INFO(CostEstimatorNode, Object); +}; +class CostEstimator : public ObjectRef { + public: + CostEstimator(); + explicit CostEstimator(::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n) : ObjectRef(n) {} + TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(CostEstimator); + const CostEstimatorNode* operator->() const { + return static_cast(data_.get()); + } + const CostEstimatorNode* get() const { return operator->(); } + using ContainerType = CostEstimatorNode; +}; + +class MockEstimatorNode : public CostEstimatorNode { + public: + Cost Estimate(const IRModule& mod, const Target& target, bool needs_tvm_tuning) const; + + static constexpr const char* _type_key = "collage.MockEstimator"; + TVM_DECLARE_FINAL_OBJECT_INFO(MockEstimatorNode, CostEstimatorNode); + + protected: + friend class MockEstimator; + + Map target_costs_; +}; + +class MockEstimator : public CostEstimator { + public: + explicit MockEstimator(Map target_costs); + + TVM_DEFINE_OBJECT_REF_METHODS(MockEstimator, CostEstimator, MockEstimatorNode); +}; + +} // 
namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COST_ESTIMATOR_H_ diff --git a/src/relay/collage/dataflow_graph.cc b/src/relay/collage/dataflow_graph.cc new file mode 100644 index 0000000000000..b4e19a73f04d3 --- /dev/null +++ b/src/relay/collage/dataflow_graph.cc @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.cc + * \brief A representation of the dataflow for an overall Relay expression. 
+ */ + +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +DataflowGraph::DataflowGraph(Expr expr) : expr_(std::move(expr)) { + indexed_graph_ = CreateIndexedGraph(expr_); + downstream_map_.reserve(indexed_graph_->size()); + for (PostDfsIndex index = 0; index < indexed_graph_->size(); ++index) { + const Node* node = indexed_graph_->index_to_node(index); + std::unordered_set downstream_nodes; + node->AccumulateDownstreamNodes(&downstream_nodes); + IndexSet index_set(indexed_graph_->size()); + for (const Node* downstream_node : downstream_nodes) { + index_set.Add(downstream_node->index_); + } + downstream_map_.emplace_back(std::move(index_set)); + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/dataflow_graph.h b/src/relay/collage/dataflow_graph.h new file mode 100644 index 0000000000000..a30132ec3d61a --- /dev/null +++ b/src/relay/collage/dataflow_graph.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.h + * \brief A representation of the dataflow for an overall Relay expression. 
+ */ +#ifndef TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ +#define TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ + +#include + +#include +#include + +#include "../ir/indexed_graph.h" +#include "index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Represents the dataflow of an overall Relay expression. + */ +class DataflowGraph { + public: + using Node = IndexedGraph::Node; + + explicit DataflowGraph(Expr expr); + + size_t size() const { return indexed_graph_->size(); } + const Node* index_to_node(PostDfsIndex index) const { + return indexed_graph_->index_to_node(index); + } + const Node* item_to_node(const Expr& expr) const { return indexed_graph_->item_to_node(expr); } + const Node* item_to_node(const ExprNode* expr_node) const { + return indexed_graph_->item_to_node(expr_node); + } + const IndexedGraph& indexed_graph() const { return *indexed_graph_; } + + const IndexSet& downstream_of(PostDfsIndex index) const { + ICHECK_LT(index, indexed_graph_->size()); + return downstream_map_[index]; + } + + private: + /*! \brief The overall expression. */ + Expr expr_; + /*! \brief The indexed graph which captures the main dataflow. */ + std::unique_ptr> indexed_graph_; + /*! \brief Map from a node's PostDfsIndex to the set of it's downstream dataflow node indexes. */ + std::vector downstream_map_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ diff --git a/src/relay/collage/gather_partition_specs.cc b/src/relay/collage/gather_partition_specs.cc new file mode 100644 index 0000000000000..0275541d9fa53 --- /dev/null +++ b/src/relay/collage/gather_partition_specs.cc @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/gather_partition_specs.cc + * \brief Gather the relevant \p PartitionSpecs from the available \p Targets. + */ + +#include "./gather_partition_specs.h" + +#include "utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +PartitionRule MakeCombinePartitionRule(PartitionRule sub_rule, Array combiner_rules, + size_t max_max_depth) { + if (combiner_rules.empty()) { + return sub_rule; + } else { + return CombinePartitionRule("", std::move(sub_rule), std::move(combiner_rules), max_max_depth); + } +} + +/*! \brief Returns the primitive combiner rules which mimic TVM's \p FuseOps. */ +Array TVMCombinerRules() { + Array simple_rules; + // Mimic the FuseOps rules. 
+ simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kBroadcast)); + simple_rules.push_back(ByKindSimpleCombinerRule(kBroadcast, kCommReduce)); + simple_rules.push_back(ByKindSimpleCombinerRule(kInjective, kInjective)); + + Array combiner_rules; + // Fire the simple fusion rules + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + // Fuse tuple arguments + combiner_rules.push_back(TupleArgCombinerRule("tuple")); + // Fuse tuple projection + combiner_rules.push_back(TupleProjCombinerRule("proj")); + + return combiner_rules; +} + +size_t GetMaxMaxDepth(std::string key) { + transform::PassContext ctxt = transform::PassContext::Current(); + std::string config_key = "relay.collage." + key; + Optional opt_max_max_depth = ctxt->GetConfig(config_key, Optional()); + ICHECK(opt_max_max_depth.defined()) + << "missing binding for '" << config_key << "' in pass context"; + ICHECK(opt_max_max_depth.value()->value > 0) + << "invalid value for '" << config_key << "' in pass context"; + return static_cast(opt_max_max_depth.value()); +} + +/*! \brief Returns partition rule mimicking TVM FuseOps. */ +PartitionRule MakeTVMPartitionRule() { + size_t max_max_depth = GetMaxMaxDepth("tvm_max_max_depth"); + // Build singleton candidates for all calls to ops <= kOutEWiseFusable. + OpCallByKindPartitionRule op_call_by_kind(""); + // Combine candidates according to the TVM fusion rules. + PartitionRule combine = + MakeCombinePartitionRule(std::move(op_call_by_kind), TVMCombinerRules(), max_max_depth); + // Discard invalid candidates. + SubGraphConfig sub_graph_config; + sub_graph_config.allow_taps = false; + sub_graph_config.max_max_depth = max_max_depth; + sub_graph_config.max_exits = 1; + return OnlyValidPartitionRule("", std::move(combine), sub_graph_config); + // NOTE: We don't wrap by a "Primitive" since we want to defer making TVM fusion decisions until + // after running more Relay passes. +} + +/*! 
+ * \brief Returns the fusion style for \p compiler. + * + * TODO(mbs): Defer to per-BYOC integration definition. + */ +BYOCStyle BYOCFusionStyleForCompiler(const String& compiler) { + if (compiler == "cutlass" || compiler == "cublas" || compiler == "cudnn") { + return kNoFusionBYOCStyle; + } else if (compiler == "tensorrt") { + return kTVMFusionBYOCStyle; + } else { + return kArbitraryFusionBYOCStyle; + } +} + +/*! + * \brief Returns the primitive combiner rules which allow for any touching candidates + * to be fused provided they don't have kind \p kOpaque. + */ +Array BYOCCombinerRules(const String& compiler) { + Array simple_rules; + Array combiner_rules; + switch (BYOCFusionStyleForCompiler(compiler)) { + case kNoFusionBYOCStyle: + break; + case kTVMFusionBYOCStyle: + // Conservatively assume the BYOC toolchain follows the same rules as for TVM's FuseOps. + simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kBroadcast)); + simple_rules.push_back(ByKindSimpleCombinerRule(kBroadcast, kCommReduce)); + simple_rules.push_back(ByKindSimpleCombinerRule(kInjective, kInjective)); + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + break; + case kArbitraryFusionBYOCStyle: + // Just try all combinations up to the max_max_depth limit. + simple_rules.push_back(ByKindSimpleCombinerRule(kOutEWiseFusable, kOutEWiseFusable)); + combiner_rules.push_back(AllSimpleCombinerRule("combiner", std::move(simple_rules))); + break; + } + return combiner_rules; +} + +/*! + * \brief Returns partition rule mimicking one entry in the patterns list passed to the + * MergeComposite pass. 
+ */ +PartitionRule MakeLabelledDFPatternPartitionRule( + const std::string& compiler, String rule_name, DFPattern dataflow_pattern, + TPatternPredicate predicate = DefaultPatternPredicate) { + DFPatternPartitionRule patterns("", std::move(dataflow_pattern), std::move(predicate)); + return CompositePartitionRule(std::move(rule_name), std::move(patterns)); +} + +/*! + * \brief Returns partition rule mimicking + * MergeComposite/AnnotateTarget/MergeCompilerRegions/PartitionGraph passes for "compiler" + * attribute of \p target. + */ +PartitionRule MakePatternBYOCPartitionRule(const std::string& compiler, + Array sub_rules) { + size_t max_max_depth = GetMaxMaxDepth("byoc_max_max_depth"); + // Union all the individual pattern rules. + UnionPartitionRule unioned("", std::move(sub_rules)); + PartitionRule combine = + MakeCombinePartitionRule(std::move(unioned), BYOCCombinerRules(compiler), max_max_depth); + // Ignore invalid candidates. + SubGraphConfig sub_graph_config; + sub_graph_config.allow_taps = false; + sub_graph_config.max_max_depth = max_max_depth; + sub_graph_config.max_exits = 1; + OnlyValidPartitionRule valid("", std::move(combine), sub_graph_config); + // Wrap the candidates in a "Primitive" function with a "Compiler" attribute. 
+ return PrimitivePartitionRule("", std::move(valid)); +} + +TVM_REGISTER_GLOBAL("relay.collage.make_labelled_dfpattern_partition_rule") + .set_body_typed([](String compiler, String rule_name, DFPattern dataflow_pattern) { + return MakeLabelledDFPatternPartitionRule(std::move(compiler), std::move(rule_name), + std::move(dataflow_pattern)); + }); + +TVM_REGISTER_GLOBAL("relay.collage.make_labelled_dfpattern_partition_rule_with_predicate") + .set_body_typed([](String compiler, String rule_name, DFPattern dataflow_pattern, + TPatternPredicate predicate) { + return MakeLabelledDFPatternPartitionRule(std::move(compiler), std::move(rule_name), + std::move(dataflow_pattern), std::move(predicate)); + }); + +TVM_REGISTER_GLOBAL("relay.collage.make_pattern_byoc_partition_rule") + .set_body_typed(MakePatternBYOCPartitionRule); + +/*! + * \brief Returns the rule to pick out expression nodes which can be 'left behind' for execution + * on the host. + */ +PartitionRule MakeHostPartitionRule() { return HostPartitionRule(""); } + +} // namespace + +Array GatherPartitionSpecs(const CompilationConfig& config) { + Array result; + for (const auto& primitive_target : config->primitive_targets) { + String spec_name = GetSpecName(primitive_target); + PartitionRule rule; + if (primitive_target.IsExternalCodegen()) { + // Transition to the Python side so we can get access to the BYOC pattern registry. + // That will bounce right back into the above construction helpers. 
+ static const runtime::PackedFunc* make_byoc_partition_rule = + runtime::Registry::Get("tvm.relay.collage.make_byoc_partition_rule"); + ICHECK(make_byoc_partition_rule); + rule = (*make_byoc_partition_rule)(spec_name); + VLOG(1) << "Target " << primitive_target->ToDebugString() << " is for BYOC spec_name " + << spec_name << " and has default partition rule:\n" + << rule->ToString(); + } else { + rule = MakeTVMPartitionRule(); + VLOG(1) << "Target " << primitive_target->ToDebugString() << " is for TVM spec_name " + << spec_name << " and has default partition rule:\n" + << rule->ToString(); + } + result.push_back(PartitionSpec(spec_name, primitive_target, rule)); + } + + // Add one more spec to cover the host target. + result.push_back(PartitionSpec(kHostSpecName, config->host_target, MakeHostPartitionRule())); + + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/gather_partition_specs.h b/src/relay/collage/gather_partition_specs.h new file mode 100644 index 0000000000000..62ffca27d635e --- /dev/null +++ b/src/relay/collage/gather_partition_specs.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/collage/gather_partition_specs.h + * \brief Gather the relevant \p PartitionSpecs from the available \p Targets. + */ +#ifndef TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ +#define TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ + +#include + +#include "./partition_spec.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief The 'styles' of BYOC integrations. Used to influence how their corresponding + * partition rule is constructed. + */ +enum BYOCStyle { + /*! + * \brief The BYOC patterns pick out 'ideal' candidates directly, either because: + * - the BYOC toolchain does not perform any fusion so each matched sub-expression maps 1:1 to a + * BYOC-provided operator, or + * - the BYOC toolchain does perform fusion, however the patterns have been written to pick out + * fusable sub-graphs. + */ + kNoFusionBYOCStyle, + + /*! + * \brief The BYOC patterns pick out supported operators, but the BYOC backend may perform + * fusion over those operators in much the same way TVM does. + */ + kTVMFusionBYOCStyle, + + /*! + * \brief The BYOC patterns pick out supported operators, but the BYOC backend may perform + * arbitrary fusion over those operators. + */ + kArbitraryFusionBYOCStyle, +}; + +/*! + * \brief Returns all the partition specifications gathered from the \p Targets in \p config. + */ +Array GatherPartitionSpecs(const CompilationConfig& config); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_GATHER_PARTITION_SPECS_H_ diff --git a/src/relay/collage/index_set.cc b/src/relay/collage/index_set.cc new file mode 100644 index 0000000000000..55bec80820a47 --- /dev/null +++ b/src/relay/collage/index_set.cc @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.cc + * \brief Efficient representation of a set of post-dfs indexes. + */ + +#include "./index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +// TODO(mbs): These should operate one-word-at-a-time + +IndexSet::IndexSet(size_t size, const std::vector& indexes) : bitvec_(size, false) { + for (size_t index : indexes) { + ICHECK_LT(index, bitvec_.size()); + ICHECK(!bitvec_[index]); + bitvec_[index] = true; + } +} + +IndexSet IndexSet::operator&(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator|(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] || that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator-(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && !that.bitvec_[index]; + } + return IndexSet(result); +} + 
+bool IndexSet::AreDisjoint(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::IsSubset(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::Intersects(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return true; + } + } + return false; +} + +IndexSet IndexSet::Subst(size_t new_size, const IndexSubst& subst) const { + std::vector result(new_size, false); + for (PostDfsIndex index = 0; index < bitvec_.size(); ++index) { + if (!bitvec_[index]) { + continue; + } + auto itr = subst.find(index); + ICHECK(itr != subst.end()); + PostDfsIndex new_index = itr->second; + ICHECK(new_index < new_size); + ICHECK(!result[new_index]); + result[new_index] = true; + } + return IndexSet(result); +} + +size_t IndexSet::PopCount() const { + size_t n = 0; + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + ++n; + } + } + return n; +} + +bool IndexSet::IsZero() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return false; + } + } + return true; +} + +size_t IndexSet::FirstInsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::LastInsideIndex() const { + for (size_t i = bitvec_.size(); i > 0; i--) { + const size_t index = i - 1; + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::NextIndex(size_t index) const { + 
ICHECK_LT(index, bitvec_.size()); + for (index++; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::FirstOutsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (!bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +bool IndexSet::operator==(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ == that.bitvec_; +} + +bool IndexSet::operator!=(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ != that.bitvec_; +} + +bool IndexSet::operator<(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return true; + } + if (!bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return false; +} + +size_t IndexSet::hash() const { + std::hash> h; + return h(bitvec_); +} + +std::string IndexSet::ToString() const { + std::ostringstream os; + os << "{"; + bool first = true; + for (size_t start = 0; start < bitvec_.size(); /*no-op*/) { + if (!bitvec_[start]) { + ++start; + continue; + } + size_t end; + for (end = start + 1; end < bitvec_.size() && bitvec_[end]; ++end) { + /*no-op*/ + } + if (first) { + first = false; + } else { + os << ","; + } + os << start; + if (end > start + 2) { + os << ".." << (end - 1); + start = end; + } else { + ++start; + } + } + os << "}"; + return os.str(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/index_set.h b/src/relay/collage/index_set.h new file mode 100644 index 0000000000000..f24b695cc76c9 --- /dev/null +++ b/src/relay/collage/index_set.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.h + * \brief Efficient representation of a set of post-dfs indexes. + */ + +#ifndef TVM_RELAY_COLLAGE_INDEX_SET_H_ +#define TVM_RELAY_COLLAGE_INDEX_SET_H_ + +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../ir/indexed_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +using IndexSubst = std::unordered_map; + +class IndexSet { + public: + IndexSet() = default; + explicit IndexSet(size_t size) : bitvec_(size, false) {} + IndexSet(size_t size, const std::vector& indexes); + + IndexSet operator&(const IndexSet& that) const; + IndexSet operator|(const IndexSet& that) const; + IndexSet operator-(const IndexSet& that) const; + bool AreDisjoint(const IndexSet& that) const; + bool IsSubset(const IndexSet& that) const; + bool Intersects(const IndexSet& that) const; + + bool operator[](size_t index) const { + ICHECK_LT(index, bitvec_.size()); + return bitvec_[index]; + } + + IndexSet& Add(size_t index) { + ICHECK_LT(index, bitvec_.size()); + bitvec_[index] = true; + return *this; + } + + IndexSet Subst(size_t new_size, const IndexSubst& subst) const; + + size_t end_index() const { return bitvec_.size(); } + size_t PopCount() const; + bool IsZero() const; + size_t 
FirstInsideIndex() const; + size_t LastInsideIndex() const; + size_t NextIndex(size_t index) const; + size_t FirstOutsideIndex() const; + bool operator==(const IndexSet& that) const; + bool operator!=(const IndexSet& that) const; + bool operator<(const IndexSet& that) const; + size_t hash() const; + std::string ToString() const; + + struct IndexSetIterator { + const IndexSet* set; + size_t i; + + size_t operator*() const { + ICHECK_LT(i, set->end_index()); + return i; + } + + const IndexSetIterator& operator++() { + ICHECK_LT(i, set->end_index()); + i = set->NextIndex(i); + return *this; + } + + bool operator==(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i == that.i; + } + + bool operator!=(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i != that.i; + } + }; + + IndexSetIterator begin() const { return IndexSetIterator{this, FirstInsideIndex()}; } + IndexSetIterator end() const { return IndexSetIterator{this, end_index()}; } + + private: + explicit IndexSet(std::vector bitvec) : bitvec_(std::move(bitvec)) {} + + std::vector bitvec_; +}; + +struct IndexSetEqual { + bool operator()(const IndexSet& left, const IndexSet& right) const { return left == right; } +}; + +struct IndexSetHash { + size_t operator()(const IndexSet& set) const { return set.hash(); } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_INDEX_SET_H_ diff --git a/src/relay/collage/name_supply.cc b/src/relay/collage/name_supply.cc new file mode 100644 index 0000000000000..4b7d497b0d577 --- /dev/null +++ b/src/relay/collage/name_supply.cc @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/name_supply.cc
+ * \brief A source of fresh variable names.
+ */
+
+#include "./name_supply.h"
+
+#include <cctype>
+
+#include <sstream>
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+namespace {
+/*!
+ * \brief Appends a C-identifier-safe rendering of \p str to \p os: characters other
+ * than alphanumerics and '_' are replaced by '_'. \p first tracks, across successive
+ * calls, whether the very first character of the overall name is still to be emitted;
+ * if so and \p str does not begin with a letter or '_', a leading '_' is inserted.
+ */
+void AppendCSafe(bool* first, std::ostringstream& os, const std::string& str) {
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char c = str[i];
+    // Fix: was 'i == 0 && first && ...', which tests the *pointer* (always non-null
+    // at the call sites) rather than the flag it points to, so the leading '_' guard
+    // applied to every hint instead of only the overall first character. Also cast to
+    // unsigned char before the <cctype> calls to avoid UB on negative char values.
+    if (i == 0 && *first && (!std::isalpha(static_cast<unsigned char>(c)) && c != '_')) {
+      os << "_";
+    }
+    if (c == '_' || std::isalnum(static_cast<unsigned char>(c))) {
+      os << c;
+    } else {
+      os << "_";
+    }
+    *first = false;
+  }
+}
+}  // namespace
+
+// Returns a child supply sharing this supply's prefix and a snapshot of its
+// per-basename counters, so names minted by the child cannot collide with
+// names already issued by the parent (at snapshot time).
+NameSupply NameSupply::MakeSubNameSupply() {
+  NameSupply result(prefix_);
+  for (const auto& kv : next_free_index_) {
+    result.next_free_index_.emplace(kv.first, kv.second);
+  }
+  return result;
+}
+
+// Builds a fresh name from the prefix and non-empty hints, joined by '_', then
+// uniquifies it with a per-basename counter ("name", "name_1", "name_2", ...).
+std::string NameSupply::Fresh(const std::initializer_list& hints) {
+  std::ostringstream os;
+  bool first = true;
+  bool need_sep = false;
+  if (!prefix_.empty()) {
+    AppendCSafe(&first, os, prefix_);
+    need_sep = true;
+  }
+  for (const auto& hint : hints) {
+    if (hint.empty()) {
+      continue;
+    }
+    if (need_sep) {
+      os << "_";
+    }
+    AppendCSafe(&first, os, hint);
+    need_sep = true;
+  }
+  std::string name = os.str();
+  auto itr = next_free_index_.find(name);
+  if (itr == next_free_index_.end()) {
+    next_free_index_.emplace(name, 1);
+  } else {
+    os << "_" << itr->second++;
+    name = os.str();
+  }
+  return name;
+}
+
+} // 
namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/name_supply.h b/src/relay/collage/name_supply.h new file mode 100644 index 0000000000000..d37023ab6f815 --- /dev/null +++ b/src/relay/collage/name_supply.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/name_supply.h + * \brief A source of fresh variable names. + */ + +#ifndef TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ +#define TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ + +#include +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! \brief A supply of fresh names. */ +class NameSupply { + public: + explicit NameSupply(std::string prefix) : prefix_(std::move(prefix)) {} + + NameSupply MakeSubNameSupply(); + + void Reserve(const std::string& existing) { next_free_index_.emplace(existing, 1); } + + std::string Fresh(const std::initializer_list& hints); + + private: + /*! \brief Prefix for all names. May be empty. */ + std::string prefix_; + /*! \brief Next unused index for variables with given basename. 
*/ + std::unordered_map next_free_index_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_NAME_SUPPLY_H_ diff --git a/src/relay/collage/partition_rule.cc b/src/relay/collage/partition_rule.cc new file mode 100644 index 0000000000000..25429aeb5f094 --- /dev/null +++ b/src/relay/collage/partition_rule.cc @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.cc + * \brief Compositional partitioning rules. 
+ */ + +#include "./partition_rule.h" + +#include + +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(PartitionRuleNode); + +void PartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + ICHECK(false) << "PartitionRuleNode::AllCandidates should be overridden in sub-class"; + return {}; +} + +std::string PartitionRuleNode::ToString() const { return ToDoc().str(); } + +Doc PartitionRuleNode::ToDoc() const { + Doc doc; + doc << GetTypeKey() << "(" << Doc::NewLine(2); + std::vector body_items; + AppendBodyItems(&body_items); + doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine(); + doc << ")"; + return doc; +} + +void PartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + body_items->emplace_back(); + body_items->back() << "rule_name=" << Doc::StrLiteral(rule_name_); +} + +PartitionRule::PartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +bool DefaultPatternPredicate(const Expr& matched_sub_expr) { return true; } + +TVM_REGISTER_NODE_TYPE(DFPatternPartitionRuleNode); + +void DFPatternPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector DFPatternPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running DFPatternPartitionRule(" << rule_name_ << ")"; + std::vector result; + DFPatternMatcher matcher(&dataflow_graph.indexed_graph()); + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (!matcher.Match(pattern_, sub_expr)) { + continue; + } + if (!predicate_(sub_expr)) { + VLOG(1) << 
"DFPatternPartitionRule(" << rule_name_ << ") has failing predicate"; + continue; + } + IndexSet inside = MatcherToIndexSet(matcher); + OpPatternKind kind; + String label; + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); + String rule_name = rule_name_.empty() ? sub_graph->label_ : rule_name_; + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "DFPatternPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + VLOG(1) << "DFPatternPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void DFPatternPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "pattern=" << PrettyPrint(pattern_); +} + +DFPatternPartitionRule::DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->pattern_ = std::move(pattern); + node->predicate_ = std::move(predicate); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CompositePartitionRuleNode); + +void CompositePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector CompositePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running CompositePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kComposite, rule_name_); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = 
candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "CompositePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "CompositePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void CompositePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +CompositePartitionRule::CompositePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(PrimitivePartitionRuleNode); + +void PrimitivePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PrimitivePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running PrimitivePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kPrimitive, Integer(1)); + if (spec->target_.IsExternalCodegen()) { + // The spec name will be the target kind name which is 1:1 with the "Compiler" attribute name. 
+ attrs.Set(attr::kCompiler, spec->spec_name_); + } + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "PrimitivePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "PrimitivePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void PrimitivePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +PrimitivePartitionRule::PrimitivePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(UnionPartitionRuleNode); + +void UnionPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector UnionPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector result; + for (const auto& sub_rule : sub_rules_) { + std::vector candidates = sub_rule->AllCandidates(dataflow_graph, spec); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "UnionPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + } + VLOG(1) << "UnionPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + 
+void UnionPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + for (const auto& sub_rule : sub_rules_) { + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule->ToDoc(); + } +} + +UnionPartitionRule::UnionPartitionRule(String rule_name, Array sub_rules) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rules_ = std::move(sub_rules); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(OpCallByKindPartitionRuleNode); + +void OpCallByKindPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector OpCallByKindPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running OpCallByKindPartitionRule(" << rule_name_ << ")"; + std::vector result; + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + auto node = dataflow_graph.index_to_node(index); + Expr sub_expr = node->ref(); + if (sub_expr->IsInstance()) { + OpPatternKind kind; + String label; + std::tie(kind, label) = SubExprKindAndLabel(sub_expr); + if (kind <= kOutEWiseFusable) { + IndexSet inside(dataflow_graph.size(), {index}); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); + String rule_name = NestLabels(rule_name_, sub_graph->label_); + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "OpCallByKindPartitionRule(" << rule_name_ << ") yields " + << candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + } + } + VLOG(1) << "OpCallByKindPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void OpCallByKindPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); +} + +OpCallByKindPartitionRule::OpCallByKindPartitionRule(String rule_name) { + auto node = 
runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CombinePartitionRuleNode); + +void CombinePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector CombinePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + // We'll accumulate all the candidates here, starting with those from the sub-rule. + // Once a candidate is added to this vector it is immutable. + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running CombinePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + CandidateSet result_set(std::move(candidates)); + + size_t num_rounds = 0; + AppendAllResultsContext ctxt(&dataflow_graph, max_max_depth_, &result_set); + while (result_set.PrepareForNextRound()) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << result_set.size() << " candidates (" << result_set.first_new_index() + << " existing)"; + for (const auto& combiner_rule : combiner_rules_) { + combiner_rule->AppendAllResults(&ctxt); + } + } + + std::vector result; + for (auto& candidate : result_set.MovedCurrentCandidates()) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "CombinePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "CombinePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void CombinePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); + for (const auto& combiner_rule : combiner_rules_) { + body_items->emplace_back(); + 
body_items->back() << "combiner_rule=" << combiner_rule->ToString();
+  }
+  body_items->emplace_back();
+  body_items->back() << "max_max_depth=" << max_max_depth_;
+}
+
+// NOTE(review): constructor parameter renamed 'max_max_depth_' -> 'max_max_depth'.
+// The trailing underscore is this file's convention for *data members* (cf. the
+// sibling constructors above); using it on a parameter obscures which identifier
+// is the member in 'node->max_max_depth_ = ...'. Behavior is unchanged.
+CombinePartitionRule::CombinePartitionRule(String rule_name, PartitionRule sub_rule,
+                                           Array combiner_rules,
+                                           size_t max_max_depth) {
+  auto node = runtime::make_object();
+  node->rule_name_ = std::move(rule_name);
+  node->sub_rule_ = std::move(sub_rule);
+  node->combiner_rules_ = std::move(combiner_rules);
+  node->max_max_depth_ = max_max_depth;
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_NODE_TYPE(OnlyValidPartitionRuleNode);
+
+void OnlyValidPartitionRuleNode::VisitAttrs(AttrVisitor* v) {
+  // TODO(mbs)
+}
+
+std::vector OnlyValidPartitionRuleNode::AllCandidates(
+    const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const {
+  std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec);
+  VLOG(1) << "running OnlyValidPartitionRule(" << rule_name_ << ") over " << candidates.size()
+          << " sub-candidates";
+  std::vector result;
+  for (auto& candidate : candidates) {
+    if (!candidate->sub_graph_->IsValid(dataflow_graph, config_)) {
+      VLOG(2) << "Ignoring invalid candidate " << candidate->ToString();
+      continue;
+    }
+    String rule_name = NestLabels(rule_name_, candidate->rule_name_);
+    CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name));
+    VLOG(2) << "OnlyValidPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString();
+    result.emplace_back(std::move(new_candidate));
+  }
+  VLOG(1) << "OnlyValidPartitionRule(" << rule_name_ << ") produced " << result.size()
+          << " candidates";
+  return result;
+}
+
+void OnlyValidPartitionRuleNode::AppendBodyItems(std::vector* body_items) const {
+  PartitionRuleNode::AppendBodyItems(body_items);
+  body_items->emplace_back();
+  body_items->back() << "sub_rule=" << sub_rule_->ToDoc();
+  body_items->emplace_back();
+  body_items->back() << "config=" << config_.ToString();
+}
+ 
// We'll use a zero cost for the candidate since we'll never want to actually estimate the cost
+ CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec, Cost::Zero()); + VLOG(2) << "HostPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.push_back(candidate); + } + VLOG(1) << "HostPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + +void HostPartitionRuleNode::AppendBodyItems(std::vector* body_items) const {} + +HostPartitionRule::HostPartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/partition_rule.h b/src/relay/collage/partition_rule.h new file mode 100644 index 0000000000000..a9209b4235552 --- /dev/null +++ b/src/relay/collage/partition_rule.h @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.h + * \brief Compositional partitioning rules. 
when we need to measure its cost.
And we only + * pay for rewriting the overall Relay expression to commit to a partitioning when the Collage + * search has completed. + * + * The base rules implemented so far: + * - \p DFPatternPartitionRule: Given a \p DFPattern and expression predicate, produces a candidate + * for every sub-graph matched by the pattern and predicate. Unlike the \p PatternRewriter, + * candidates are free to overlap. Used to bring BYOC patterns into the Collage framework. + * - \p OpCallByKindPartitionRule: Uses the "TOpPattern" attribute provided for every Relay + * operator to produce a candidate for every call to a 'fusable Relay operator'. Used to + * look ahead to how TVM will fuse sub-graphs. + * + * The combinator rules implemented so far: + * - \p CompositePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Composite" function. The "Composite" name is taken from the rule name. Used to indicate + * Relay operators (or groups of Relay operators) should be mapped to target-specific operators, + * both for BYOC and TVM external library integrations. + * - \p PrimitivePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Primitive" function, possibly with an additional "Compiler" attribute. Used to + * delineate a partition (or kernel). + * - \p UnionPartitionRule: Simply unions all the candidates from all sub-rules together. Used to + * combine individual \p DFPatternPartitionRules. + * - \p CombinePartitionRule: Given a sub-rule and a list of 'combiner' rules, finds + * all possible ways of combining the sub-rule's candidates to yield even larger candidates. + * Note that the sub-rule's candidates may also be directly included in the results. The + * 'combiner' rules allow combining by \p OpPatternKinds, combining the arguments to tuples + * which themselves are arguments to Relay operator calls, and so on. 
(Though not yet implemented, we'd like to allow a combinator rule which will union candidates + * based on their 'anchor' operators. This can be used to implement 'vertical' and 'horizontal' + * partition on more primitive candidates. Note that the \p SubGraph machinery supports + * multiple-input and -output sub-graphs and their validation, so horizontal partition is easy to + * implement.)
CompositePartitionRule(label1) + * DFPatternPartitionRule(pattern1) + * : + * CompositePartitionRule(labeln) + * DFPatternPartitionRule(patternn)
+ * \brief Returns all the possible candidate partitions according to this rule for the overall + * expression corresponding to \p dataflow_graph. The candidates will generally have unknown + * target and cost: the target will be filled in by the \p PartitionSpec, while the cost will + * be filled in lazily. + */ + virtual std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const; + + std::string ToString() const; + Doc ToDoc() const; + + protected: + virtual void AppendBodyItems(std::vector* body_items) const; + + public: + static constexpr const char* _type_key = "relay.collage.PartitionRule"; + static constexpr const uint32_t _type_child_slots = 10; + TVM_DECLARE_BASE_OBJECT_INFO(PartitionRuleNode, Object); +}; + +class PartitionRule : public ObjectRef { + public: + explicit PartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(PartitionRule, ObjectRef, PartitionRuleNode); +}; + +/*! + * \brief Partition rule which fires on all sub-expressions matching a dataflow-pattern and pattern + * predicate. It is valid for matching candidates to overlap. + */ +class DFPatternPartitionRuleNode : public PartitionRuleNode { + public: + /*! + * \brief Relay pattern. + */ + DFPattern pattern_; + + /*! + * \brief Predicate on matched sub-expression to decide if partition rule should fire. 
+ */ + TPatternPredicate predicate_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.DFPatternPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(DFPatternPartitionRuleNode, PartitionRuleNode); +}; + +class DFPatternPartitionRule : public PartitionRule { + public: + DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate = DefaultPatternPredicate); + + TVM_DEFINE_OBJECT_REF_METHODS(DFPatternPartitionRule, PartitionRule, DFPatternPartitionRuleNode); +}; + +/*! + * \brief Partition rule which wraps candidates within a function with the "Composite" attribute + * bound to the given rule name. + * + * This is the standard way by which operators or operator groups are tagged as being supported + * by a particular externally provided function. It is up to the BYOC lowering function to + * recognize the "Composite" name and emit the appropriate code or call. + */ +class CompositePartitionRuleNode : public PartitionRuleNode { + public: + /*! \brief The sub-partition rule. */ + PartitionRule sub_rule_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.CompositePartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(CompositePartitionRuleNode, PartitionRuleNode); +}; + +class CompositePartitionRule : public PartitionRule { + public: + CompositePartitionRule(String rule_name, PartitionRule sub_rule); + + TVM_DEFINE_OBJECT_REF_METHODS(CompositePartitionRule, PartitionRule, CompositePartitionRuleNode); +}; + +/*! 
This is the standard way by which sub-graphs are marked as being in a 'partition' whose + * compilation will be managed by an external BYOC toolchain.
/*! + * \brief Partition rule which places calls to Relay operators
And when we write 'kl -> kr' we mean it to match a sub-expression of kind kr or less whose + * dataflow inputs are all of kind kl or less.
Partition rule which keeps only candidates from the sub-rule whose sub-groups are valid
Relay constructs, such as let bindings,
+ */ +class HostPartitionRuleNode : public PartitionRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + public: + static constexpr const char* _type_key = "relay.collage.HostPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(HostPartitionRuleNode, PartitionRuleNode); +}; + +class HostPartitionRule : public PartitionRule { + public: + explicit HostPartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(HostPartitionRule, PartitionRule, HostPartitionRuleNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PARTITION_RULE_H_ diff --git a/src/relay/collage/partition_spec.cc b/src/relay/collage/partition_spec.cc new file mode 100644 index 0000000000000..60c2e6b6d9764 --- /dev/null +++ b/src/relay/collage/partition_spec.cc @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_spec.cc + * \brief Combine a \p PartitionRule with one or more \p Targets. 
+ */ + +#include "./partition_spec.h" + +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +String DefaultValidateSubGraphFunc(const Function& function) { return String(); } + +TVM_REGISTER_NODE_TYPE(PartitionSpecNode); + +void PartitionSpecNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PartitionSpecNode::AllCandidates( + const DataflowGraph& dataflow_graph) const { + std::vector result; + // Make sure the target is in scope for inspection by any predicates in + // DFPatternPartitionRuleNode rules. + With target_scope(target_); + // Gather all the candidates. + std::vector candidates = + rule_->AllCandidates(dataflow_graph, GetRef(this)); + // Update the rules names. + for (const auto& candidate : candidates) { + ICHECK_EQ(candidate->spec_, GetRef(this)); + String rule_name = NestLabels(spec_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(candidate, std::move(rule_name)); + result.emplace_back(std::move(new_candidate)); + } + return result; +} + +std::string PartitionSpecNode::ToString() const { + Doc doc; + doc << "PartitionSpec(" << Doc::NewLine(2); + std::vector body_items; + body_items.emplace_back(); + body_items.back() << "spec_name=" << Doc::StrLiteral(spec_name_); + body_items.emplace_back(); + body_items.back() << "target=" << target_->ToDebugString(); + body_items.emplace_back(); + body_items.back() << "rule=" << rule_->ToDoc(); + doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine(); + doc << ")"; + return doc.str(); +} + +PartitionSpec::PartitionSpec(String spec_name, Target target, PartitionRule rule, + TValidateSubGraphFunc validate_sub_graph_func) { + auto node = runtime::make_object(); + node->spec_name_ = std::move(spec_name); + node->target_ = std::move(target); + node->rule_ = std::move(rule); + node->validate_sub_graph_func_ = std::move(validate_sub_graph_func); + data_ = std::move(node); +} + +} // namespace collage +} // namespace 
relay +} // namespace tvm diff --git a/src/relay/collage/partition_spec.h b/src/relay/collage/partition_spec.h new file mode 100644 index 0000000000000..90a7b6d65a65f --- /dev/null +++ b/src/relay/collage/partition_spec.h @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_spec.h + * \brief Combine a \p PartitionRule with one or more \p Targets. + */ + +#ifndef TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ +#define TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ + +#include +#include +#include + +#include +#include + +#include "./partition_rule.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Type of functions for checking the validity of partitions before they proceed to lowering + * and codegen. The argument is the function extracted from the overall expression to represent + * the partition. The result is a non-empty error message string if the candidate should be + * rejected. + */ +using TValidateSubGraphFunc = TypedPackedFunc; + +/*! + * \brief The default validation function. Always returns the empty string, ie no error. + */ +String DefaultValidateSubGraphFunc(const Function& function); + +/*! 
It's tempting to support multiple targets here. Eg the partitioning rules for
+ */ + std::vector AllCandidates(const DataflowGraph& dataflow_graph) const; + + std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.PartitionSpec"; + TVM_DECLARE_FINAL_OBJECT_INFO(PartitionSpecNode, Object); +}; + +class PartitionSpec : public ObjectRef { + public: + PartitionSpec(String spec_name, Target target, PartitionRule rule, + TValidateSubGraphFunc validate_sub_graph_func = DefaultValidateSubGraphFunc); + + TVM_DEFINE_OBJECT_REF_METHODS(PartitionSpec, ObjectRef, PartitionSpecNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PARTITION_SPEC_H_ diff --git a/src/relay/collage/priority_queue.h b/src/relay/collage/priority_queue.h new file mode 100644 index 0000000000000..1d30fe5d96af3 --- /dev/null +++ b/src/relay/collage/priority_queue.h @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/priority_queue.h + * \brief An updatable priority queue. + */ + +#ifndef TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ +#define TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ + +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! 
\brief Priority queue of search states, ordered by increasing cost. */ +template +class PriorityQueue { + public: + PriorityQueue() = default; + + /*! \brief Pushes \p item onto the queue. */ + void Push(T* item) { set_.emplace(item); } + + /*! \brief Pops the item with the least cost off the queue. */ + T* Pop() { + ICHECK(!set_.empty()); + T* item = *set_.begin(); + set_.erase(set_.begin()); + return item; + } + + /*! \brief Updates the queue to account for \p item's best cost being lowered. */ + void Update(T* item) { + auto itr = std::find_if(set_.begin(), set_.end(), + [item](const T* that) { return EqTPtr()(that, item); }); + ICHECK(itr != set_.end()); + set_.erase(itr); + set_.emplace(item); + } + + bool empty() const { return set_.empty(); } + size_t size() const { return set_.size(); } + + private: + // TODO(mbs): Actually use a pri-queue datastructure! + std::set set_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PRIORITY_QUEUE_H_ diff --git a/src/relay/collage/prune_candidates.cc b/src/relay/collage/prune_candidates.cc new file mode 100644 index 0000000000000..f6a53b75f4b24 --- /dev/null +++ b/src/relay/collage/prune_candidates.cc @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/prune_candidates.cc + * \brief Try to remove candidates which will never contribute to an optimal partitioning. + */ + +#include "./prune_candidates.h" + +#include "./dataflow_graph.h" +#include "./gather_partition_specs.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +/*! + * \brief Returns a map from post-dfs dataflow node indices to the indices within \p candidates for + * those candidates which intersect that dataflow node. + * + * NOTE: The index set in the vector results is over candidate indices not post-dfs indices! + */ +std::vector MakeInsideMap(const DataflowGraph& dataflow_graph, + const std::vector& candidates) { + std::vector result(dataflow_graph.size(), IndexSet(candidates.size())); + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition candidate = candidates[i]; + for (PostDfsIndex index : candidate->sub_graph_->inside_) { + result[index].Add(i); + } + } + return result; +} + +/*! + * \brief Returns the maximal candidates within \p candidates. A candidate is maximal if it is not + * contained by any super-candidate for the same target. + */ +std::vector MaximalCandidates( + const DataflowGraph& dataflow_graph, const std::vector& candidates) { + std::vector inside_map = MakeInsideMap(dataflow_graph, candidates); + std::vector result; + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition maximal_candidate = candidates[i]; + bool has_super_candidate = false; + IndexSet explored_candidates(candidates.size()); // over candidates! + for (PostDfsIndex index : maximal_candidate->sub_graph_->inside_) { + for (size_t j : inside_map[index]) { + if (i == j) { + // Ignore self. + continue; + } + if (explored_candidates[j]) { + // Already checked. 
+ continue; + } + explored_candidates.Add(j); + CandidatePartition super_candidate = candidates[j]; + if (maximal_candidate->spec_ == super_candidate->spec_ && + maximal_candidate->sub_graph_->inside_.IsSubset(super_candidate->sub_graph_->inside_)) { + has_super_candidate = true; + break; + } + } + if (has_super_candidate) { + break; + } + } + if (!has_super_candidate) { + VLOG(2) << "Found maximal candidate " << maximal_candidate->ToString(); + result.emplace_back(maximal_candidate); + } + } + VLOG(1) << "Have " << result.size() << " maximal candidates"; + return result; +} + +/*! + * \brief Returns all the candidates in \p candidates which intersect without being equal. + */ +std::vector IntersectingCandidates( + const DataflowGraph& dataflow_graph, const std::vector& candidates) { + std::vector inside_map = MakeInsideMap(dataflow_graph, candidates); + IndexSet intersecting(candidates.size()); // over candidates! + for (size_t i = 0; i < candidates.size(); ++i) { + CandidatePartition intersecting_candidate = candidates[i]; + IndexSet explored_candidates(candidates.size()); // over candidates! + for (PostDfsIndex index : intersecting_candidate->sub_graph_->inside_) { + for (size_t j : inside_map[index]) { + if (j < i) { + // Intersection is commutative. + continue; + } + if (i == j) { + // Ignore self. + continue; + } + if (explored_candidates[j]) { + // Already checked. + continue; + } + explored_candidates.Add(j); + CandidatePartition other_candidate = candidates[j]; + if (intersecting_candidate->sub_graph_->inside_ == other_candidate->sub_graph_->inside_) { + // Have same inside set. 
+ continue; + } + VLOG(2) << "Candidate " << intersecting_candidate->ToString() << " intersects with " + << other_candidate->ToString(); + intersecting.Add(i); + intersecting.Add(j); + } + } + } + std::vector result; + for (size_t i : intersecting) { + CandidatePartition candidate = candidates[i]; + VLOG(2) << "Found intersecting candidate " << candidate->ToString(); + result.emplace_back(candidate); + } + VLOG(1) << "Have " << result.size() << " intersecting candidates"; + return result; +} + +/*! + * \brief Returns the set operation left - right. + */ +std::vector SetDifference(const std::vector& left, + const std::vector& right) { + std::unordered_set + right_set(right.begin(), right.end()); + std::vector result; + for (const auto& candidate : left) { + if (right_set.count(candidate) == 0) { + result.emplace_back(candidate); + } + } + return result; +} + +/*! + * \brief Adds everything in right to left. Returns the number of elements added. + */ +size_t SetUnionInPlace( + std::unordered_set* left, + const std::vector& right) { + size_t init_size = left->size(); + for (const auto& candidate : right) { + left->emplace(candidate); + } + return left->size() - init_size; +} + +} // namespace + +std::vector PruneCandidates( + const DataflowGraph& dataflow_graph, + const std::vector& initial_candidates) { + VLOG_CONTEXT << "prune"; + // Start with all candidates available. + std::vector candidates = initial_candidates; + std::unordered_set pruned; + size_t num_rounds = 0; + while (true) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << candidates.size() << " candidates"; + // Add all the maximal candidates to the pruned set. + std::vector maximal_candidates = + MaximalCandidates(dataflow_graph, candidates); + size_t num_new_pruned = SetUnionInPlace(&pruned, maximal_candidates); + VLOG(1) << "Added " << num_new_pruned << " new pruned candidates"; + if (num_new_pruned == 0) { + // We've reached a fixed point. 
+ break; + } + // If two pruned candidates intersect without being equal then we may miss valid + // paths during search. So remove those intersecting candidates from the available candidates + // and try again so as to find smaller candidates to 'bridge the gaps'. + std::vector pruned_vec(pruned.begin(), pruned.end()); + std::vector intersecting_candidates = + IntersectingCandidates(dataflow_graph, pruned_vec); + // We need more maximal candidates to fill in the gaps between the current pruned candidates. + // Force that by removing the intersecting candidates from the set of available candidates + // and going around again. + candidates = SetDifference(candidates, intersecting_candidates); + } + + VLOG(1) << "Have " << pruned.size() << " pruned candidates"; + std::vector result(pruned.begin(), pruned.end()); + // Re-establish a canonical order of candidates. + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/prune_candidates.h b/src/relay/collage/prune_candidates.h new file mode 100644 index 0000000000000..6e35870b9b97f --- /dev/null +++ b/src/relay/collage/prune_candidates.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/prune_candidates.h + * \brief Try to remove candidates which will never contribute to an optimal partitioning. + */ + +#ifndef TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ +#define TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ + +#include + +#include "./candidate_partition.h" +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Returns \p initial_candidates with all unnecessary candidates pruned. + * + * We prune according to the following two heuristics: + * 1. Given partitions (A, target) and (B, target) then + * cost(A union B, target) < cost(A, target) + cost(B, target). + * That is, there's no use estimating the cost of small partitions when a larger partition + * containing them is also available. More precisely, call a partition 'maximal' if it is + * not contained by any other partition for the same target. Then we want to prefer maximal + * candidates when searching. + * 2. Given maximal partitions (A union B, target) and (A union B, target') where + * target != target', then min(cost(A union B, target), cost(A union B, target')) < + * min(cost(A, target) + cost(B, target'), cost(A, target') + cost(B, target)). + * That is, there's no use estimating cross-combinations of partitions which are not maximal. + * + * However, we can't prune a non-maximal candidate if it will make some other maximal candidate + * unreachable during the Collage search. We achieve this by iterating until fixed point: + * - Find maximal candidates of current set of candidates. + * - Add those maximal candidates to the output 'pruned' set. + * - If any two candidates in the 'pruned' set intersect without being equal, remove those from + * the current set of candidates and go around again. That will force more candidates to + * be considered 'maximal'. 
+ * That over-approximates the true necessary candidates but is at least simple. + */ +std::vector PruneCandidates( + const DataflowGraph& dataflow_graph, const std::vector& initial_candidates); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_PRUNE_CANDIDATES_H_ diff --git a/src/relay/collage/recover_virtual_device_map.cc b/src/relay/collage/recover_virtual_device_map.cc new file mode 100644 index 0000000000000..47265b85c8a21 --- /dev/null +++ b/src/relay/collage/recover_virtual_device_map.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/recover_virtual_device_map.cc + * \brief Recover the virtual device for every Relay expression node. 
+ */ + +#include "./recover_virtual_device_map.h" + +#include "../transforms/device_aware_visitors.h" + +namespace tvm { +namespace relay { +namespace collage { + +std::unordered_map RecoverVirtualDeviceMap(const IRModule& mod, + const Expr& expr) { + class Visitor : public transform::DeviceAwareExprVisitor { + public: + explicit Visitor(const Optional& maybe_mod) + : transform::DeviceAwareExprVisitor(maybe_mod) {} + + void VisitExpr(const Expr& expr) final { + map_[expr.get()] = GetVirtualDevice(expr); + transform::DeviceAwareExprVisitor::VisitExpr(expr); + } + + std::unordered_map map_; + }; + + Visitor visitor(mod); + visitor.VisitExpr(expr); + return std::move(visitor.map_); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/recover_virtual_device_map.h b/src/relay/collage/recover_virtual_device_map.h new file mode 100644 index 0000000000000..e3104b457e458 --- /dev/null +++ b/src/relay/collage/recover_virtual_device_map.h @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/collage/recover_virtual_device_map.h + * \brief Recover the virtual device for every Relay expression node. 
+ * + * Temporary hack until virtual_device_ work is finished. + */ +#ifndef TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ +#define TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ + +#include + +#include + +namespace tvm { +namespace relay { +namespace collage { + +std::unordered_map RecoverVirtualDeviceMap(const IRModule& mod, + const Expr& expr); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_RECOVER_VIRTUAL_DEVICE_MAP_H_ diff --git a/src/relay/collage/sub_graph.cc b/src/relay/collage/sub_graph.cc new file mode 100644 index 0000000000000..016ce958ee5ba --- /dev/null +++ b/src/relay/collage/sub_graph.cc @@ -0,0 +1,1005 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/sub_graph.cc + * \brief Represents a sub-graph of an overall Relay expression. + */ + +#include "./sub_graph.h" + +#include + +#include "../../support/scalars.h" +#include "../transforms/pass_utils.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +class Extractor; + +/*! + * \brief Helper class for rewriting expressions to replace a sub-graph according to the + * given extractor. 
+ */ +class Rewriter : public ExprMutator { + public: + explicit Rewriter(const Extractor* extractor) : extractor_(extractor) {} + + Expr VisitExpr(const Expr& expr) final; + + private: + /*! \brief Already prepared extractor which will guide the rewrite. */ + const Extractor* extractor_; +}; + +/*! \brief Helper class for extracting matched sub-graphs from the overall expression. */ +class Extractor : public ExprMutator { + public: + Extractor(const DataflowGraph* dataflow_graph, const SubGraphNode* sub_graph, + FunctionAttrsMap opt_attrs) + : dataflow_graph_(dataflow_graph), sub_graph_(sub_graph), opt_attrs_(std::move(opt_attrs)) { + ICHECK_EQ(dataflow_graph_->size(), sub_graph_->overall_size()); + } + + const DataflowGraph& dataflow_graph() const { return *dataflow_graph_; } + + /*! + * \brief Collect the parameters and output expressions for the function representing + * the sub-graph. + */ + void Extract() { + ICHECK(!sub_graph_->IsEmpty()); + VLOG(2) << "Extracting " << sub_graph_->ToString(); + const bool for_function = opt_attrs_.defined(); + + // In reverse dataflow order... + for (PostDfsIndex i = dataflow_graph_->size(); i > 0; --i) { + PostDfsIndex index = i - 1; + if (!sub_graph_->inside_[index]) { + // Node is outside sub-graph. + continue; + } + VLOG(2) << "index " << index; + auto node = dataflow_graph_->index_to_node(index); + if (sub_graph_->exit_[node->index_] || node->is_external_ || memo_.count(node->ref()) == 0) { + // This sub-expression is: + // - inside the sub-graph and needed outside the sub-graph. So it must contribute to an + // output (even if we've already visited it while constructing an output from a + // downstream sub-expression). + // - not yet visited, in which case it must still be considered an 'output' so it will + // be evaluated for any possible side effects. 
+ Expr output = VisitExpr(GetRef(node->node_ref_)); + VLOG(2) << "index " << index << " added as output:\n" + << PrettyPrint(output) << "\nat " << outputs_.size(); + expr_to_output_index_.emplace(node->node_ref_, outputs_.size()); + outputs_.emplace_back(std::move(output)); + output_types_.emplace_back(node->node_ref_->checked_type()); + } + } + ICHECK(!outputs_.empty()); + + // Reverse the outputs so as to preserve the original evaluation order. + std::reverse(outputs_.begin(), outputs_.end()); + std::reverse(output_types_.begin(), output_types_.end()); + for (auto& kv : expr_to_output_index_) { + kv.second = static_cast(outputs_.size()) - 1 - kv.second; + } + + // Build a 'body' expression to represent the extracted sub-graph. If we have multiple + // outputs we'll place them in a tuple. + Type body_type; + Expr body; + if (outputs_.size() > 1) { + body_type = TupleType(output_types_); + body = Tuple(outputs_); + body->checked_type_ = body_type; + } else { + body_type = output_types_.front(); + body = outputs_.front(); + } + + // Re-express all the sub-sub-graphs in terms of the body. + DataflowGraph body_dataflow_graph(body); + std::vector sub_sub_graphs; + IndexSubst subst = MakeIndexSubst(body_dataflow_graph); + for (const auto& sub_sub_graph : sub_graph_->sub_sub_graphs_) { + sub_sub_graphs.emplace_back(sub_sub_graph.Subst(body_dataflow_graph, subst)); + } + + // Sweep backwards through the body, rewriting to account for each sub-sub-graph. + body = SubSubGraph::ParallelRewrite(body_dataflow_graph, body, std::move(sub_sub_graphs)); + + if (for_function) { + // Rewrite so all input nodes are now conveyed via call arguments to a new function. 
+ Array arg_types; + arg_types.reserve(params_.size()); + for (const auto& param : params_) { + arg_types.push_back(param->checked_type()); + } + extracted_ = Function(std::move(params_), std::move(body), body_type, + /*ty_params=*/{}, DictAttrs(opt_attrs_)); + extracted_->checked_type_ = + FuncType(std::move(arg_types), body_type, /*type_params=*/{}, /*type_constraints=*/{}); + body = Call(extracted_, std::move(args_)); + body->checked_type_ = body_type; + } else { + // Don't do anything with the inputs. + extracted_ = body; + } + + // Setup the output substitution. + for (const auto& kv : expr_to_output_index_) { + Expr expr; + if (outputs_.size() == 1) { + expr = body; + } else if (for_function) { + expr = TupleGetItem(body, kv.second); + expr->checked_type_ = output_types_[kv.second]; + } else { + const auto* tuple_node = body.as(); + ICHECK(tuple_node); + expr = tuple_node->fields[kv.second]; + } + VLOG(2) << "output " << dataflow_graph_->item_to_node(kv.first)->index_ << " is at index " + << kv.second << " (of " << outputs_.size() << " outputs)"; + output_substitution_.emplace(kv.first, std::move(expr)); + } + } + + ////// Following members are valid only after Extract() has returned. + + /*! + * \brief Returns the expression representing the extracted sub-graph. If opt_attrs_ is + * defined then will be a function. + */ + Expr extracted() const { return extracted_; } + + /*! + * \brief Returns the substitution to apply to all expression nodes in the overall expression + * so as to replace references to outputs of the sub-graph with their rewritten form. + */ + const std::unordered_map& output_substitution() const { + return output_substitution_; + } + + private: + /*! + * \brief Returns a map from original index to new index for each node inside the sub-graph. Only + * valid after \p Extract has made its backwards dataflow sweep. 
+ */ + IndexSubst MakeIndexSubst(const DataflowGraph& new_dataflow_graph) const { + VLOG(2) << "building extractor substitution"; + IndexSubst subst; + for (PostDfsIndex index : sub_graph_->inside_) { + auto orig_node = dataflow_graph_->index_to_node(index); + ICHECK_EQ(orig_node->index_, index); + auto itr = memo_.find(orig_node->ref()); + ICHECK(itr != memo_.end()); + auto new_node = new_dataflow_graph.item_to_node(itr->second); + VLOG(2) << orig_node->index_ << " |-> " << new_node->index_; + subst.emplace(orig_node->index_, new_node->index_); + } + return subst; + } + + /*! \brief Returns true if \p expr is inside the sub-graph. */ + bool inside(const Expr& expr) { + return sub_graph_->inside_[dataflow_graph_->item_to_node(expr)->index_]; + } + + /*! + * \brief Returns the variable uniquely representing \p expr, which should be + * an input node (ie outside the sub-graph but feeding into a node inside the sub-graph). + * + * It is valid for: + * - An expression outside the sub-graph to be used multiple times inside the sub-graph. + * - An expression outside the sub-graph to be used both inside and outside the sub-graph. + */ + Var VarFor(const Expr& expr) { + ICHECK(!inside(expr)); + ICHECK(opt_attrs_.defined()); + auto itr = expr_to_param_.find(expr.get()); + if (itr != expr_to_param_.end()) { + return itr->second; + } + auto fresh_var = Var("FunctionVar_" + std::to_string(params_.size()), expr->checked_type()); + fresh_var->checked_type_ = expr->checked_type(); + params_.push_back(fresh_var); + args_.push_back(expr); + expr_to_param_.emplace(expr.get(), fresh_var); + return fresh_var; + } + + /*! + * \brief If \p expr is inside the sub-graph then return it's rewritten form. + * If \p expr is outside the sub-graph then it must correspond to an input node. + * - If opt_attrs_ is defined return the variable to represent it. + * - Otherwise just return the expression directly. + * + * Should be called only on inputs to nodes which are inside the sub-graph. 
+ */ + Expr VisitExpr(const Expr& expr) final { + if (inside(expr)) { + return ExprMutator::VisitExpr(expr); + } else if (CanInline(expr)) { + // Implicitly include inlinable input sub-expressions. + return expr; + } else if (opt_attrs_.defined()) { + // Map to a function parameter. + return VarFor(expr); + } else { + // Stop rewriting. + return expr; + } + } + + Expr VisitExpr_(const FunctionNode* function_node) override { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + return GetRef(function_node); + } + return ExprMutator::VisitExpr_(function_node); + } + + //// Context fields, passed in constructor. + + /*! \brief The dataflow graph corresponding to the overall expression. */ + const DataflowGraph* dataflow_graph_; + /*! \brief The sub-graph of the above we are extracting. */ + const SubGraphNode* sub_graph_; + /*! \brief Optional attributes if the sub-graph should be extracted as a function. */ + FunctionAttrsMap opt_attrs_; + + //// Result fields, available after Extract() called. + + /*! + * \brief The extracted expression. If opt_attrs_ is defined this will be a function. + */ + Expr extracted_; + /*! + * \brief Map from output nodes to corresponding expressions. If the sub-graph has more than + * one exit node then each entry will be a tuple projection. + */ + std::unordered_map output_substitution_; + + //// Accumulator fields, built as we visit expressions. + + /*! \brief (If opt_attrs_ is defined) Parameters representing input expression nodes. */ + Array params_; + /*! + * \brief (If opt_attrs_ is defined) The input expression nodes for each of the above params_. + */ + Array args_; + /*! + * \brief (If opt_attrs_ is defined) Map from existing input expression nodes to the parameters + * in params_ which now representing them. + */ + std::unordered_map expr_to_param_; + /*! + * \brief Accumulated new expressions which represent the exit nodes of the rewritten sub-graph. + * It is possible to have multiple outputs. 
It is possible one output also contributes to other + * outputs (ie the output is a 'tap'). + */ + std::vector outputs_; + /*! \brief (If opt_attrs_ is defined) Types of original expressions corresponding to outputs_. */ + std::vector output_types_; + /*! + * \brief Map from existing exit expression nodes to the index in outputs_ which should + * represent them in the rewritten overall expression. + */ + std::unordered_map expr_to_output_index_; +}; + +Expr Rewriter::VisitExpr(const Expr& expr) { + auto itr = extractor_->output_substitution().find(expr.get()); + if (itr == extractor_->output_substitution().end()) { + return ExprMutator::VisitExpr(expr); + } else { + return itr->second; + } +} + +} // namespace + +std::pair SubExprKindAndLabel(const Expr& sub_expr) { + class Visitor : public ExprFunctor(const Expr&)> { + private: + std::pair VisitExpr_(const CallNode* call_node) final { + if (const auto* op_node = call_node->op.as()) { + auto op = GetRef(op_node); + static auto fpattern = Op::GetAttrMap("TOpPattern"); + if (fpattern.count(op) == 0) { + VLOG(1) << "no TOpPattern known for " << op->name << ", considering opaque"; + return {kOpaque, op->name}; + } else if (IsDynamic(call_node->checked_type()) && IsDataDependent(call_node)) { + VLOG(1) << "call has dynamic shape which is data-dependent, considering opaque"; + return {kOpaque, op->name}; + } else { + OpPatternKind kind = static_cast(fpattern[op]); + VLOG(2) << "TOpPattern for " << op->name << " is " << KindToString(kind); + return {kind, op->name}; + } + } else if (const auto* function_node = call_node->op.as()) { + Optional opt_i = + function_node->GetAttr("TOpPattern", Optional()); + if (opt_i.defined()) { + OpPatternKind kind = static_cast(opt_i.value()->value); + VLOG(1) << "TOpPattern for function is " << KindToString(kind); + return {kind, "call_prim"}; + } else { + VLOG(1) << "calling function without TOpPattern, considering opaque"; + return {kOpaque, "call_fun"}; + } + } else { + VLOG(1) << 
"unsupported call, considering opaque"; + return {kOpaque, "call_any"}; + } + } + + std::pair VisitExpr_(const ConstantNode* constant_node) final { + VLOG(2) << "TOpPattern for constant is " << KindToString(kElemWise); + if (support::IsSimpleScalar(constant_node)) { + return {kElemWise, "scalar"}; + } else { + return {kElemWise, "const"}; + } + } + + std::pair VisitExpr_(const TupleNode* tuple_node) final { + const auto* tuple_type_node = tuple_node->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple is " << KindToString(kInjective); + return {kInjective, "tuple"}; + } else { + VLOG(1) << "tuple contains non-tensors, considering opaque"; + return {kOpaque, "tuple"}; + } + } + + std::pair VisitExpr_( + const TupleGetItemNode* tuple_get_item_node) final { + const auto* tuple_type_node = tuple_get_item_node->tuple->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple projection is " << KindToString(kInjective); + return {kInjective, "proj"}; + } else { + VLOG(1) << "tuple being projected contains non-tensors, considering opaque"; + return {kOpaque, "proj"}; + } + } + + // TODO(mbs): We implement the following mostly so we have a lightweight way of describing + // the current sub-expression. If partitioning is ever extended beyond the usual call/tuple/proj + // sub-language we should revise the returned operator kinds to match. 
+ + std::pair VisitExpr_(const VarNode* var_node) final { + return {kOpaque, "%" + var_node->name_hint()}; + } + std::pair VisitExpr_(const GlobalVarNode* global_var_node) final { + return {kOpaque, "@" + global_var_node->name_hint}; + } + std::pair VisitExpr_(const OpNode* op_node) final { + return {kOpaque, "`" + op_node->name}; + } + std::pair VisitExpr_(const FunctionNode* function_node) final { + return {kOpaque, "fn"}; + } + std::pair VisitExpr_(const LetNode* let_node) final { + return {kOpaque, "let"}; + } + std::pair VisitExpr_(const IfNode* if_node) final { + return {kOpaque, "if"}; + } + std::pair VisitExpr_(const RefCreateNode* ref_create_node) final { + return {kOpaque, "ref"}; + } + std::pair VisitExpr_(const RefReadNode* op) final { + return {kOpaque, "ref_read"}; + } + std::pair VisitExpr_(const RefWriteNode* op) final { + return {kOpaque, "ref_write"}; + } + std::pair VisitExpr_(const ConstructorNode* op) final { + return {kOpaque, "`" + op->name_hint}; + } + std::pair VisitExpr_(const MatchNode* op) final { + return {kOpaque, "match"}; + } + }; + return Visitor().VisitExpr(sub_expr); +} + +std::pair SubGraphKindAndLabel(const DataflowGraph& dataflow_graph, + const IndexSet& inside) { + std::ostringstream os; + bool first = true; + OpPatternKind max_kind = kElemWise; + for (PostDfsIndex index : inside) { + OpPatternKind sub_kind; + std::string sub_label; + std::tie(sub_kind, sub_label) = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref()); + if (!sub_label.empty()) { + if (first) { + first = false; + } else { + os << "+"; + } + os << sub_label; + } + max_kind = CombineKinds(max_kind, sub_kind); + } + return {max_kind, os.str()}; +} + +IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher) { + IndexSet result(matcher.size()); + for (const auto& kv : matcher.memo()) { + for (const auto& matched_sub_expr : kv.second) { + if (CanInline(matched_sub_expr)) { + // Trivial sub-expressions can just be included in the extracted function body 
+ // when we construct it and don't need to be considered part of the sub-graph. + continue; + } + if (kv.first.as()) { + // Don't consider the expressions matched by a wildcard to be part of the sub-graph. + continue; + } + result.Add(matcher.expr_to_node(matched_sub_expr)->index_); + } + } + return result; +} + +std::string SubGraphConfig::ToString() const { + std::ostringstream os; + os << "{max_exits=" << max_exits; + os << ",allow_taps=" << allow_taps; + os << ",max_max_depth=" << max_max_depth; + os << "}"; + return os.str(); +} + +TVM_REGISTER_NODE_TYPE(SubSubGraphNode); + +void SubSubGraphNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +SubGraph SubSubGraphNode::sub_graph() const { return Downcast(sub_graph_obj_); } + +bool SubSubGraphNode::operator==(const SubSubGraphNode& that) const { + return *sub_graph().get() == *that.sub_graph().get(); +} + +bool SubSubGraphNode::operator<(const SubSubGraphNode& that) const { + return *sub_graph().get() < *that.sub_graph().get(); +} + +size_t SubSubGraphNode::hash() const { + size_t h = StructuralHash()(attrs_); + h ^= sub_graph()->hash() + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; +} + +std::string SubSubGraphNode::ToString() const { + std::ostringstream os; + os << "{sub_graph=" << sub_graph()->ToString(); + os << ",attrs=" << PrettyPrint(attrs_); + os << "}"; + return os.str(); +} + +Function SubSubGraphNode::Extract(const DataflowGraph& dataflow_graph) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + return Downcast(extractor.extracted()); +} + +Expr SubSubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + Rewriter rewriter(&extractor); + return rewriter.VisitExpr(expr); +} + +SubSubGraph::SubSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs) { + auto data = runtime::make_object(); + data->sub_graph_obj_ = std::move(sub_graph); + 
data->attrs_ = std::move(attrs); + data_ = std::move(data); +} + +SubSubGraph SubSubGraph::Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const { + return SubSubGraph(get()->sub_graph().Subst(new_dataflow_graph, subst), get()->attrs_); +} + +bool SubSubGraph::TriviallyUnionable(const SubSubGraph& that) const { + if (get()->attrs_.size() != that->attrs_.size()) { + return false; + } + for (const auto& kv : get()->attrs_) { + if (kv.first == "Composite") { + // Even if all the attributes agree we don't consider "Composite" functions to + // ever be unionable. + // TODO(mbs): Find a cleaner way to do this. + return false; + } + auto itr = that->attrs_.find(kv.first); + if (itr == that->attrs_.end()) { + return false; + } + if (!StructuralEqual()(kv.second, (*itr).second)) { + return false; + } + } + return true; +} + +SubSubGraph SubSubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, + const SubSubGraph& that) const { + ICHECK(TriviallyUnionable(that)); + return SubSubGraph(get()->sub_graph().DisjointUnion(dataflow_graph, that->sub_graph()), + get()->attrs_); +} + +/*static*/ +Expr SubSubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_sub_graphs) { + // IMPORTANT: See the corresponding comment in SubGraph::ParallelRewrite. 
+ std::sort(sub_sub_graphs.begin(), sub_sub_graphs.end(), + [](const SubSubGraph& left, const SubSubGraph& right) { + return left->sub_graph()->last_inside_index_ > right->sub_graph()->last_inside_index_; + }); + + Expr result = expr; + for (const auto& sub_sub_graph : sub_sub_graphs) { + result = sub_sub_graph->Rewrite(dataflow_graph, result); + } + return result; +} + +TVM_REGISTER_NODE_TYPE(SubGraphNode); + +void SubGraphNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +IndexSet SubGraphNode::Downstream(const DataflowGraph& dataflow_graph) const { + IndexSet downstream(dataflow_graph.size()); + for (PostDfsIndex exit_index : exit_) { + downstream = downstream | dataflow_graph.downstream_of(exit_index); + } + return downstream; +} + +bool SubGraphNode::IsValid(const DataflowGraph& dataflow_graph, + const SubGraphConfig& config) const { + // Check we don't have too many exit nodes. + if (config.max_exits > 0 && exit_.PopCount() > config.max_exits) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: " << exit_.PopCount() + << " exits exceeds maximum " << config.max_exits; + return false; + } + + // Check the maximum path depth is in limit. + if (config.max_max_depth > 0 && max_depth_ > config.max_max_depth) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: maximum depth " << max_depth_ + << " exceeds limit " << config.max_max_depth; + return false; + } + + // All inside nodes must be in the same basic block. + const DataflowGraph::Node* basic_block = nullptr; + for (PostDfsIndex index : inside_) { + auto node = dataflow_graph.index_to_node(index); + if (basic_block == nullptr) { + basic_block = node->basic_block_; + } + if (node->basic_block_ != basic_block) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: nodes are from different basic blocks"; + return false; + } + } + + // The sub-sub-graphs must be subsets and non-overlapping. 
+ IndexSet union_inside(dataflow_graph.size()); + for (const auto& sub_sub_graph : sub_sub_graphs_) { + if (!sub_sub_graph->sub_graph()->inside_.AreDisjoint(union_inside)) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: sub-sub-graphs overlap"; + return false; + } + if (!sub_sub_graph->sub_graph()->inside_.IsSubset(inside_)) { + VLOG(1) << "Subgraph " << ToString() + << " is invalid: sub-sub-graph is not subset of overall sub-graph"; + return false; + } + } + + if (!config.allow_taps) { + // Exit nodes cannot also contribute to inside nodes. + for (PostDfsIndex index : exit_) { + auto node = dataflow_graph.index_to_node(index); + if (AnyOutputInside(node)) { + VLOG(1) << "Subgraph " << ToString() + << " is invalid: inner node is 'tapped' and also contributes to output, but taps " + "are disabled"; + return false; + } + } + } + + // Check no output would end up feeding into any entry node. + for (PostDfsIndex output_index : output_) { + if (dataflow_graph.downstream_of(output_index).Intersects(entry_)) { + VLOG(1) << "Subgraph " << ToString() << " is invalid: output node " << output_index + << " feeds back into this sub-graph"; + return false; + } + } + + // Looks legit! + return true; +} + +Function SubGraphNode::ExtractAsFunction(const DataflowGraph& dataflow_graph) const { + SubSubGraph sub_sub_graph(GetRef(this), FunctionAttrsMap()); + return sub_sub_graph->Extract(dataflow_graph); +} + +Expr SubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const { + if (sub_sub_graphs_.empty()) { + // Nothing to rewrite. 
+ return expr; + } + Extractor extractor(&dataflow_graph, this, NullValue()); + extractor.Extract(); + Rewriter rewriter(&extractor); + return rewriter.VisitExpr(expr); +} + +std::string SubGraphNode::ToString() const { + std::ostringstream os; + os << "{inside=" << inside_.ToString(); + os << ",entry=" << entry_.ToString(); + os << ",exit=" << exit_.ToString(); + os << ",input=" << input_.ToString(); + os << ",output=" << output_.ToString(); + os << ",max_depth=" << max_depth_; + os << ",kind=" << KindToString(kind_); + if (!label_.empty()) { + os << ",label=" << label_; + } + for (const auto& sub_sub_graph : sub_sub_graphs_) { + os << ",sub_sub_graph=" << sub_sub_graph->ToString(); + } + os << "}"; + return os.str(); +} + +bool SubGraphNode::operator==(const SubGraphNode& that) const { + ICHECK_EQ(inside_.end_index(), that.inside_.end_index()); + if (inside_ != that.inside_) { + return false; + } + if (sub_sub_graphs_.size() != that.sub_sub_graphs_.size()) { + return false; + } + for (size_t i = 0; i < sub_sub_graphs_.size(); ++i) { + if (*sub_sub_graphs_[i].get() != *that.sub_sub_graphs_[i].get()) { + return false; + } + } + return true; +} + +bool SubGraphNode::operator<(const SubGraphNode& that) const { + if (first_inside_index_ < that.first_inside_index_) { + return true; + } + if (that.first_inside_index_ < first_inside_index_) { + return false; + } + return inside_ < that.inside_; +} + +size_t SubGraphNode::hash() const { + size_t h = inside_.hash(); + for (const auto& sub_sub_graph : sub_sub_graphs_) { + h ^= sub_sub_graph->hash() + 0x9e3779b9 + (h << 6) + (h >> 2); + } + return h; +} + +void SubGraphNode::Init(const DataflowGraph& dataflow_graph) { + for (PostDfsIndex index = 0; index < inside_.end_index(); ++index) { + auto node = dataflow_graph.index_to_node(index); + if (inside_[index]) { + if (AnyInputOutside(node)) { + entry_.Add(index); + } + if (AnyOutputOutside(node) || node->is_external_) { + exit_.Add(index); + } + } else { + if 
(AnyInputInside(node)) { + output_.Add(index); + } + if (AnyOutputInside(node) && !CanInline(node->ref())) { + input_.Add(index); + } + } + } + max_depth_ = MaxDepth(dataflow_graph); +} + +size_t SubGraphNode::MaxDepth(const DataflowGraph& dataflow_graph) const { + std::unordered_map max_depths; + std::vector stack; + size_t max_depth = 0; + // All the entry nodes have max depth 0. + for (PostDfsIndex index : entry_) { + auto node = dataflow_graph.index_to_node(index); + max_depths.emplace(node, 0); + stack.push_back(node); + } + while (!stack.empty()) { + const DataflowGraph::Node* node = stack.back(); + stack.pop_back(); + size_t next_depth = max_depths[node] + 1; + if (exit_[node->index_]) { + // If this node is external then it will have no outputs but we still wish to consider + // the path to the implied output as requiring one more step. + // Otherwise we're accounting for reaching one of the external outputs below. + max_depth = std::max(max_depth, next_depth); + } + for (const DataflowGraph::Node* output_node : node->outputs_) { + if (!inside_[output_node->index_]) { + continue; + } + if (max_depths.count(output_node) == 0) { + max_depths.emplace(output_node, next_depth); + stack.push_back(output_node); + } else if (next_depth > max_depths[output_node]) { + // We found a deeper path to an already expanded node. We'll expand again. + max_depths[output_node] = next_depth; + stack.push_back(output_node); + } + } + } + return max_depth; +} + +/*! \brief Returns true if any (input/output) of node is (outside/inside) the sub-graph. 
*/ +bool SubGraphNode::AnyInputOutside(const DataflowGraph::Node* node) const { + return std::any_of(node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { + return !inside_[sub_node->index_] && !CanInline(sub_node->ref()); + }); +} + +bool SubGraphNode::AnyInputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputOutside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return !inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind, + String label, std::vector sub_sub_graphs) { + std::sort( + sub_sub_graphs.begin(), sub_sub_graphs.end(), + [](const SubSubGraph& left, const SubSubGraph& right) { return *left.get() < *right.get(); }); + auto node = runtime::make_object(); + node->inside_ = std::move(inside); + node->first_inside_index_ = node->inside_.FirstInsideIndex(); + node->last_inside_index_ = node->inside_.LastInsideIndex(); + node->entry_ = IndexSet(node->inside_.end_index()); + node->exit_ = IndexSet(node->inside_.end_index()); + node->input_ = IndexSet(node->inside_.end_index()); + node->output_ = IndexSet(node->inside_.end_index()); + node->kind_ = kind; + node->label_ = std::move(label); + node->sub_sub_graphs_ = sub_sub_graphs; + node->Init(dataflow_graph); + data_ = std::move(node); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph) + : SubGraph(dataflow_graph, IndexSet(dataflow_graph.size())) {} + +bool 
SubGraph::AreDisjoint(const SubGraph& that) const { + return get()->inside_.AreDisjoint(that->inside_); +} + +namespace { +/*! \brief Returns true if an output of \p left not in \p right ultimately flows into \p right. */ +bool FlowsInto(const DataflowGraph& dataflow_graph, const SubGraph& left, const SubGraph& right) { + for (PostDfsIndex output_index : left->output_) { + if (!right->inside_[output_index] && + dataflow_graph.downstream_of(output_index).Intersects(right->entry_)) { + return true; + } + } + return false; +} +} // namespace + +bool SubGraph::AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + if (!get()->inside_.AreDisjoint(that->inside_)) { + // Easy rejection. + return false; + } + if (!get()->output_.Intersects(that->entry_)) { + // Not touching. + return false; + } + if (FlowsInto(dataflow_graph, *this, that) || FlowsInto(dataflow_graph, that, *this)) { + // Unioning would create a cycle. + return false; + } + return true; +} + +bool SubGraph::AreSelfContained(const SubGraph& that) const { + return get()->output_.IsSubset(that->entry_) && that->input_.IsSubset(get()->exit_); +} + +SubGraph SubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + ICHECK(AreDisjoint(that)); + IndexSet inside = get()->inside_ | that->inside_; + std::vector sub_sub_graphs; + for (const auto& sub_sub_graph : get()->sub_sub_graphs_) { + sub_sub_graphs.push_back(sub_sub_graph); + } + for (const auto& sub_sub_graph : that->sub_sub_graphs_) { + auto existing_itr = std::find_if(sub_sub_graphs.begin(), sub_sub_graphs.end(), + [&sub_sub_graph](const SubSubGraph& existing) { + return existing.TriviallyUnionable(sub_sub_graph); + }); + if (existing_itr != sub_sub_graphs.end()) { + *existing_itr = existing_itr->DisjointUnion(dataflow_graph, sub_sub_graph); + } else { + sub_sub_graphs.push_back(sub_sub_graph); + } + } + return SubGraph(dataflow_graph, std::move(inside), CombineKinds(get()->kind_, that->kind_), + 
UnionLabels(get()->label_, that->label_), std::move(sub_sub_graphs)); +} + +SubGraph SubGraph::WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const { + std::vector sub_sub_graphs; + sub_sub_graphs.push_back(SubSubGraph(*this, attrs)); + return SubGraph(dataflow_graph, get()->inside_, get()->kind_, get()->label_, + std::move(sub_sub_graphs)); +} + +SubGraph SubGraph::Subst(const DataflowGraph& new_dataflow_graph, const IndexSubst& subst) const { + IndexSet new_inside = get()->inside_.Subst(new_dataflow_graph.size(), subst); + std::vector new_sub_sub_graphs; + for (const auto& sub_sub_graph : get()->sub_sub_graphs_) { + new_sub_sub_graphs.push_back(sub_sub_graph.Subst(new_dataflow_graph, subst)); + } + return SubGraph(new_dataflow_graph, std::move(new_inside), get()->kind_, get()->label_, + std::move(new_sub_sub_graphs)); +} + +/*static*/ +Expr SubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_graphs) { + // IMPORTANT: + // - All the sub-graphs will be w.r.t. the dataflow graph for the original expression. + // Each time we call Rewrite on one of those graphs the result expression will be rewritten + // from the final output back to the inputs. The inputs will then be shared with the original + // expression. Thus it is safe to iteratively rewrite all the sub-graphs without redoing the + // dataflow_graph and substituting indexes provided we work in reverse dataflow order. + // - We rely on the argument expression reference holding the original expression alive so that + // the dataflow_graph will never contain dangling pointers (even though as per above we'll + // never dereference them). 
+ std::sort(sub_graphs.begin(), sub_graphs.end(), [](const SubGraph& left, const SubGraph& right) { + return left->last_inside_index_ > right->last_inside_index_; + }); + Expr result = expr; + for (const auto& sub_graph : sub_graphs) { + result = sub_graph->Rewrite(dataflow_graph, result); + } + return result; +} + +transform::Pass PartitionOnIndexesForTesting(size_t max_exits, bool allow_taps, + Array indexes, Array labels) { + auto pass_func = [=](Function function, IRModule mod, transform::PassContext ctxt) { + ICHECK(!labels.defined() || indexes.size() == labels.size()); + VLOG(1) << "Considering partitioning for:\n" << PrettyPrint(function); + DataflowGraph dataflow_graph(function); + std::unordered_map> sub_sub_graph_indexes; + std::vector node_indexes; + node_indexes.reserve(indexes.size()); + for (size_t i = 0; i < indexes.size(); ++i) { + const Integer& index = indexes[i]; + ICHECK_GE(index->value, 0); + ICHECK_LT(index->value, dataflow_graph.size()); + PostDfsIndex index_int = static_cast(index->value); + node_indexes.push_back(index_int); + if (labels.defined()) { + const String& label = labels[i]; + if (!label.empty()) { + sub_sub_graph_indexes[label].push_back(index_int); + } + } + } + std::vector sub_sub_graphs; + for (const auto& kv : sub_sub_graph_indexes) { + FunctionAttrsMap attrs; + attrs.Set("Composite", kv.first); + sub_sub_graphs.push_back( + SubSubGraph(SubGraph(dataflow_graph, IndexSet(dataflow_graph.size(), kv.second)), attrs)); + } + OpPatternKind kind; + String label; + IndexSet inside(dataflow_graph.size(), node_indexes); + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label), + std::move(sub_sub_graphs)); + SubGraphConfig config; + config.max_exits = max_exits; + config.allow_taps = allow_taps; + if (sub_graph->IsValid(dataflow_graph, config)) { + VLOG(1) << "Sub-graph " << sub_graph->ToString() << " is considered valid"; + } else { + VLOG(1) 
<< "Sub-graph " << sub_graph->ToString() + << " is NOT considered valid, not partitioning"; + return function; + } + Function result = Downcast(sub_graph->Rewrite(dataflow_graph, function)); + VLOG(1) << "Partitioned to:\n" << PrettyPrint(result); + return result; + }; + return transform::CreateFunctionPass(pass_func, /*opt_level=*/0, "PartitionOnIndexesForTesting", + {}); +} + +TVM_REGISTER_GLOBAL("relay.collage.partition_on_indexes_for_testing") + .set_body_typed([](size_t max_outputs, bool allow_taps, Array indexes, + Array labels) { + return PartitionOnIndexesForTesting(max_outputs, allow_taps, indexes, labels); + }); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/sub_graph.h b/src/relay/collage/sub_graph.h new file mode 100644 index 0000000000000..021bc73a8a26b --- /dev/null +++ b/src/relay/collage/sub_graph.h @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/sub_graph.h + * \brief Represents a sub-graph of an overall Relay expression. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_SUB_GRAPH_H_ +#define TVM_RELAY_COLLAGE_SUB_GRAPH_H_ + +#include + +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../ir/indexed_graph.h" +#include "./index_set.h" +#include "dataflow_graph.h" +#include "name_supply.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! \brief Returns operator pattern kind as single-letter string. */ +std::string KindToString(OpPatternKind kind); + +/*! + * \brief Returns a kind and label for the single \p sub_expr, ignoring it's sub-sub expressions. + */ +std::pair SubExprKindAndLabel(const Expr& sub_expr); + +/*! + * \brief Returns a kind and label for all the nodes in \p inside. + */ +std::pair SubGraphKindAndLabel(const DataflowGraph& dataflow_graph, + const IndexSet& inside); + +/*! + * \brief Returns the index set representing all the sub-expression matched by \p matcher. + */ +IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher); + +/*! + * \brief Configuration controlling which sub-graphs are considered valid. + */ +struct SubGraphConfig { + /*! \brief Maximum number of exit nodes in the sub-graph, or zero if no limit. */ + size_t max_exits = 0; + /*! + * \brief Whether a node inside the sub-graph may flow to nodes both inside and outside + * the sub-graph (which we call a 'tap'). Note that it is still possible to have multiple outputs + * even with this flag false. + */ + bool allow_taps = false; + /*! + * \brief Maximum allowed maximum depth, or zero if no-limit. + */ + size_t max_max_depth = 0; + + std::string ToString() const; +}; + +class SubGraph; +using FunctionAttrsMap = Map; + +/*! + * \brief A sub-sub graph is a sub-graph which is to be nested inside a function as part of some + * enclosing sub-graph. + * + * Extraction yields a function with input nodes replaced by parameters and exit nodes in the + * function result. 
Rewriting replaces the sub-graph with a call to that function, and all + * outputs with (projections from) the call result. + * + * (Note that it's tempting to move attrs_ into \p SubGraphNode and thus avoid this class. + * However we found the implementation was easier to understand in this form since it makes + * the result of \p Extract unambiguous.) + */ +class SubSubGraphNode : public Object { + public: + /*! \brief The nested sub-graph. */ + ObjectRef /* actually SubGraph */ sub_graph_obj_; + /*! \brief Attributes (possibly empty) to attach to the extracted function. */ + FunctionAttrsMap attrs_; + + void VisitAttrs(AttrVisitor* v); + + SubGraph sub_graph() const; + + bool operator==(const SubSubGraphNode& that) const; + bool operator!=(const SubSubGraphNode& that) const { return !(*this == that); } + bool operator<(const SubSubGraphNode& that) const; + size_t hash() const; + + std::string ToString() const; + + /*! + * \brief Returns the function representing this sub-sub-graph within the overall expression + * represented by \p dataflow_graph: + * - All sub-graph inputs become parameters. + * - All sub-graph outputs become function results (either directly or as a field in a tuple). + * - The function has attrs_ for attributes (which may be empty). + * - The function body accounts for any rewrites implied by the nested sub-graph. + */ + Function Extract(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns \p expr (which has matching \p dataflow_graph) rewritten to encode the + * partitioning implied by this sub-sub-graph. + */ + Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const; + + static constexpr const char* _type_key = "relay.collage.SubSubGraph"; + TVM_DECLARE_FINAL_OBJECT_INFO(SubSubGraphNode, Object); +}; + +class SubSubGraph : public ObjectRef { + public: + SubSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs); + + /*! 
+ * \brief Returns copy of this sub-sub-graph with all indexes substituted according to \p subst, + * whose range is w.r.t. \p new_dataflow_graph. + */ + SubSubGraph Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const; + + /*! + * \brief Returns true if this can be safely unioned. + */ + bool TriviallyUnionable(const SubSubGraph& that) const; + + /*! + * \brief Returns the disjoin union of this and \p that sub-sub graphs, which must agree on + * their attributes. + */ + SubSubGraph DisjointUnion(const DataflowGraph& dataflow_graph, const SubSubGraph& that) const; + + /*! + * \brief Returns \p expr rewritten according to all the given sub-sub-graphs. The sub-sub-graphs + * can be given in any order, but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_sub_graphs); + + TVM_DEFINE_OBJECT_REF_METHODS(SubSubGraph, ObjectRef, SubSubGraphNode); +}; + +using SubSubGraphs = Array; + +/*! + * \brief A compact representation of a sub-graph within an (implied) overall Relay expression. + * + * Sub-graphs can be used to represent partitions/kernels/composite functions without having to + * pay the cost of constructing or rewriting any expressions. We also allow 'extracting' a + * function to use for measuring a partition/kernel's latency independently from 'rewriting' + * the overall Relay expression since only a tiny subset of candidate partitions will end up being + * needed after Collage has completed its search. + * + * We expect O(thousands) of sub-graphs to be in flight while processing a given model, so are + * mindful of space overhead. + * + * A sub-graph classifies every dataflow node of the overall expression as either 'inside' or + * 'outside' the sub-graph. Obviously not all such divisions make sense, for example it is not + * valid for an inside node to feed into another inside node via outside nodes. 
We provide the + * \p IsValid method to check for validity, and \p SubGraphConfig to control which validity rules + * apply (such as maximum depth). + * + * We generally work with the \p DataflowGraph representation of the overall Relay expression + * rather than the expression itself. We use the post-dfs visit index to uniquely refer to + * expression nodes. + * + * As well as 'inside' and 'outside' we have four other flavors of dataflow nodes, all uniquely + * determined from the 'inside' nodes: + * - 'entry' nodes are those inside with at least one dataflow input outside. + * - 'exit' nodes are those inside with at least one dataflow output outside, or which + * are considered 'external' in the underlying dataflow graph (eg because they represent + * the result of the overall function). + * - 'input' nodes are those outside with at least one dataflow output inside. + * - 'output' nodes are those outside with at least one dataflow input inside. + * Index sets for these are cached with the sub-graph for performance. + * + * It is valid to have multiple entry nodes (we can bind a parameter for each). It may be valid to + * have multiple exit nodes (we can build a tuple of all such). It may be valid to have exit nodes + * which also contribute to other inside nodes (ie represent a 'tap' on an intermediate result). + * + * Sub-graphs are closed under: + * - Disjoint union. + * - Wrapping by a function with given attributes (see \p SubSubGraph above). This can be used + * to encode "Composite" functions, or to represent a candidate kernel within a "Primitive" + * function. (By combining 'wrapping' with 'union' we can encode, eg, 'this sub-graph should + * be placed inside a primitive function which itself may have calls to composite functions). + * - Substitution, which allows a sub-graph w.r.t. one dataflow graph to be transformed to + * match some other (typically smaller) dataflow graph. 
+ * + * See the subclasses of \p PartitionRule for how sub-graphs are built and combined during Collage + * search. + * + * To support some of the \p OpPatternKind-based fusion rule processing we give sub-graphs + * a kind, which is generally the maximum of the kinds of all the operator calls appearing + * inside it. We also given sub-graphs a (not necessarily unique) label to help debugging + * and guide the selection of global symbol names. + */ +class SubGraphNode : public Object { + public: + /*! + * \brief Which sub-expressions are inside the sub-graph (using their post-dfs indexes w.r.t. + * the implied DataflowGraph). + */ + IndexSet inside_; + + /*! + * \brief Index of first and last inside nodes. + * + * Cached for performance, uniquely determined by inside_. + */ + PostDfsIndex first_inside_index_ = 0; + PostDfsIndex last_inside_index_ = 0; + + /*! + * \brief Which sub-expressions are entry/exit/input/output for this sub-graph. + * + * Cached for performance, uniquely determined by inside_. + */ + IndexSet entry_; + IndexSet exit_; + IndexSet input_; + IndexSet output_; + + /*! + * \brief Maximum depth of any dataflow path from an entry to an output sub-expression. + * + * Cached for performance, uniquely determined by inside_. + */ + size_t max_depth_ = 0; + + /*! + * \brief The \p OpPatternKind summarizing the input/output behavior of the sub-graph. + * + * A sub-graph consisting of a single Relay expression node is given kind: + * - For Call to a Relay operator, the "TOpPattern" attribute of that operator (provided the + * call does not involve data-dependent dynamic shapes). + * - For Call to Relay Function, the "TOpPattern" attribute of the function (provided it has + * that attribute) + * - For Constants, \p kElemWise. + * - For Tuple and tuple projections, \p kInjective (provided all tuple fields are of tensor + * type) + * - All other nodes \p kOpaque. + * Sub-graphs with more than one node have the maximum of the kind of each node. 
+ * + * Cached for performance, uniquely determined by inside_. + */ + OpPatternKind kind_ = kOpaque; + + /*! + * \brief A label for the sub-graph. Not guaranteed to be unique, but is a human-readable summary + * of the sub-graph which can help with debugging and guide the selection of global symbol names. + */ + String label_; + + /*! + * \brief Sub-sub-graphs of this sub-graph which must be represented by functions. These must + * be disjoint, but it's ok for this sub-graph to have nodes not inside any sub-sub-graph. + */ + SubSubGraphs sub_sub_graphs_; + + void VisitAttrs(AttrVisitor* v); + + // TODO(mbs): 'Anchor nodes' and rules for unioning them. + // In FuseOps it's just the unique kEWiseFusable node, if any. + // I'd like to allow writing vertical fusion rules, eg if two candidates are directly + // connected and have nn.conv2d anchors allow their join. + // I'd also like to allow horizontal fusion rules, eg if two candidates are not directly + // connected but could be joined without producing invalid (eg cyclic) and have nn.conv2d anchors + // then do so. Come back to this. + + /*! \brief Number of nodes in overall dataflow graph. */ + size_t overall_size() const { return inside_.end_index(); } + + bool IsEmpty() const { return inside_.IsZero(); } + + /*! \brief Number of nodes in sub-graph. */ + size_t Size() const { return inside_.PopCount(); } + + /*! + * \brief Returns the dataflow nodes downstream of all exit nodes. + */ + IndexSet Downstream(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns true if this sub-graph is valid. Ie: + * - no output of the sub-graph can flow to any input of the sub-graph (otherwise we'd end up + * with a dataflow cycle when we partition). + * - all inputs and outputs of the sub-graph are in the same scope, ie not separated by + * control flow (otherwise there'd be no consistent program point at which to eval the + * partitioned function). + * - no more than config.max_exits exits are required. 
+ * - if config.allow_taps is false, no inside node has outputs to nodes both inside and + * outside the sub-graph. + */ + bool IsValid(const DataflowGraph& dataflow_graph, const SubGraphConfig& config) const; + + /*! + * \brief Returns this sub-graph extracted as a stand-alone function. The function will have + * no attributes, and is suitable for building and profiling by the \p CostEstimator. + */ + Function ExtractAsFunction(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns \p expr (which has matching \p dataflow_graph) rewritten to encode the + * partitioning implied by this sub-graph. + */ + Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const; + + std::string ToString() const; + + bool operator==(const SubGraphNode& that) const; + bool operator!=(const SubGraphNode& that) const { return !(*this == that); } + bool operator<(const SubGraphNode& that) const; + size_t hash() const; + + private: + /*! \brief Initialize the entry/exit/input/output sets given the inside and \p dataflow_graph. */ + void Init(const DataflowGraph& dataflow_graph); + + /*! \brief Calculates and returns the maximum path depth. */ + size_t MaxDepth(const DataflowGraph& dataflow_graph) const; + + /*! \brief Return's true if any (input/output) of node is (outside/inside) the sub-graph. */ + bool AnyInputOutside(const DataflowGraph::Node* node) const; + bool AnyInputInside(const DataflowGraph::Node* node) const; + bool AnyOutputOutside(const DataflowGraph::Node* node) const; + bool AnyOutputInside(const DataflowGraph::Node* node) const; + + public: + static constexpr const char* _type_key = "relay.collage.SubGraph"; + TVM_DECLARE_FINAL_OBJECT_INFO(SubGraphNode, Object); + + friend class SubGraph; +}; + +class SubGraph : public ObjectRef { + public: + /*! \brief Primitive constructor. The following constructors are generally more convenient. 
*/ + SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind = kOpaque, + String label = {}, std::vector sub_sub_graphs = {}); + + /*! \brief Constructs the empty sub-graph for \p dataflow_graph. */ + explicit SubGraph(const DataflowGraph& dataflow_graph); + + /*! \brief Returns true if this and that are disjoint. */ + bool AreDisjoint(const SubGraph& that) const; + + /*! + * \brief Returns true if: + * - \p this and \p that are disjoint, and + * - an output node of \p this coincides with an entry node of \p that, and + * - \p this and \p that are not obviously invalid after \p DisjointUnion + * (eg because such a sub-graph would produce a cycle). + * Note however that the \p DisjointUnion may not necessarily be valid even with the above + * checks. + */ + bool AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const; + + /*! + * \brief Returns true if: + * - all the outputs of \p this are entries for \p that, and + * - all the inputs of \p that are exits for \p this. + */ + bool AreSelfContained(const SubGraph& that) const; + + /*! + * \brief Returns disjoint union of this and \p that sub-graphs. The result may not be valid. + */ + SubGraph DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const; + + /*! + * \brief Returns copy of this sub-graph with all nodes placed inside a sub-sub-graph with + * given attributes. + */ + SubGraph WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const; + + /*! + * \brief Returns copy of this sub-graph with all indexes substituted according to \p subst, + * whose range is w.r.t. \p new_dataflow_graph. + */ + SubGraph Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const; + + /*! + * \brief Returns \p expr rewritten according to all the given sub-graphs. The sub-graphs can + * be given in any order, but must be disjoint. 
+ */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector sub_graphs); + + TVM_DEFINE_OBJECT_REF_METHODS(SubGraph, ObjectRef, SubGraphNode); +}; + +struct SubGraphEqual { + bool operator()(const SubGraph& left, const SubGraph& right) const { + return *left.get() == *right.get(); + } +}; + +struct SubGraphHash { + size_t operator()(const SubGraph& sub_graph) const { return sub_graph->hash(); } +}; + +/*! + * \brief Pass to partition every global function according to the post-dfs indexes + * given in an array. Visible for testing from Python only, would never make sense to use + * as a generic pass! + */ +transform::Pass PartitionOnIndexesForTesting(Array indexes); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_SUB_GRAPH_H_ diff --git a/src/relay/collage/utils.cc b/src/relay/collage/utils.cc new file mode 100644 index 0000000000000..03af980e8c1d3 --- /dev/null +++ b/src/relay/collage/utils.cc @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/utils.cc + * \brief Misc helpers. 
+ */ + +#include "./utils.h" + +#include "../../support/scalars.h" +#include "../op/memory/device_copy.h" + +namespace tvm { +namespace relay { +namespace collage { + +String GetSpecName(const Target& target) { + if (TargetKind::GetAttrMap(tvm::attr::kIsExternalCodegen).get(target->kind, Bool(false))) { + return target->kind->name; + } else { + return std::string(kTVMSpecNamePrefix) + target->kind->name; + } +} + +String UnionLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + return left + "+" + right; +} + +String NestLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + if (right.size() > left.size()) { + std::string right_str = right; + if (right_str.substr(0, left.size()) == left) { + return right; + } + } + return left + "." + right; +} + +std::string KindToString(OpPatternKind kind) { + switch (kind) { + case kElemWise: + return "E"; + case kBroadcast: + return "B"; + case kInjective: + return "I"; + case kCommReduce: + return "R"; + case kOutEWiseFusable: + return "A"; + case kTuple: + return "T"; + case kOpaque: + return "O"; + } + return "?"; +} + +OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right) { + return std::max(left, right); +} + +bool CanInline(const Expr& expr) { + if (expr.as() || expr.as() || expr.as()) { + return true; + } + if (const auto* constant_node = expr.as()) { + return support::IsSimpleScalar(constant_node); + } + return false; +} + +bool IsSpecialOp(const OpNode* op_node) { + auto op = GetRef(op_node); + static auto fnoncomputational = Op::GetAttrMap("TNonComputational"); + if (fnoncomputational.count(op) && fnoncomputational[op]) { + // Operator has been marked as non-computational. + return true; + } + // TODO(mbs): This is incomplete. 
+ static auto shape_of_op_ = Op::Get("shape_of"); + static auto vm_shape_of_op_ = Op::Get("vm.shape_of"); + if (op == DeviceCopyOp() || op == shape_of_op_ || op == vm_shape_of_op_) { + // Operator is compiled away by the VM compilation flow. + return true; + } + return false; +} + +bool MustBeLowered(const Expr& expr) { + if (const auto* call_node = expr.as()) { + if (const auto* function_node = call_node->op.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + // We've already committed to this call being to one or more operators which must be + // lowered. + return true; + } + } else if (const auto* op_node = call_node->op.as()) { + if (!IsSpecialOp(op_node)) { + // The VM compilation path won't rewrite this call. + return true; + } + } + } + return false; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/utils.h b/src/relay/collage/utils.h new file mode 100644 index 0000000000000..4c0493cdd675c --- /dev/null +++ b/src/relay/collage/utils.h @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/utils.h + * \brief Misc helpers. 
+ */ + +#ifndef TVM_RELAY_COLLAGE_UTILS_H_ +#define TVM_RELAY_COLLAGE_UTILS_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Distinguished partition spec names. + */ +constexpr const char* kTVMSpecNamePrefix = "tvm_"; +constexpr const char* kHostSpecName = "host"; + +/*! + * \brief Returns the partition spec name to use for \p target. For external codegen targets the + * spec name is just the target kind name. For TVM native targets the spec name is of the form + * "tvm_". + */ +String GetSpecName(const Target& target); + +/*! \brief Returns \p "+". */ +String UnionLabels(String left, String right); + +/*! \brief Returns \p ".". */ +String NestLabels(String outer, String inner); + +/*! \brief Returns abbreviation for \p kind. */ +std::string KindToString(OpPatternKind kind); + +/*! \brief Returns maximum of \p left and \p right. */ +OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right); + +/*! + * \brief Returns true if \p expr can be safely inlined in body of function extracted + * from sub-graph, even if \p expr was not technically matched by the pattern which produced + * the sub-graph. + */ +bool CanInline(const Expr& expr); + +/*! + * \brief Returns true if \p op_node can be directly handled by the VM. + */ +bool IsSpecialOp(const OpNode* op_node); + +/*! + * \brief Return true if the Relay expression node given by \p expr cannot be evaluated by + * the VM and must end up in a kernel. 
+ */ +bool MustBeLowered(const Expr& expr); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_UTILS_H_ diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index e3e3bfbb973e5..e38066c83d360 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -585,6 +585,7 @@ RELAY_REGISTER_OP("nn.relu") .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) .add_type_rel("Identity", IdentityRel) + // .set_attr("TOpPattern", kElemWise) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 3f1985b7ddfa5..0a5a29a9872c8 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -216,6 +216,8 @@ class AnnotateTargetRewriter : public ExprRewriter { if (!Op::HasAttrMap("target." + std::string(target))) { continue; } + // TODO(mbs): Do not check in + LOG(FATAL) << "Unexpected BYOC predicate on op " << op->name << " for target " << target; auto fannotate = Op::GetAttrMap("target." + std::string(target)); const Expr& ex = GetRef(pre); if (fannotate.count(op) && fannotate[op](ex)) { diff --git a/src/relay/transforms/infer_layout_utils.cc b/src/relay/transforms/infer_layout_utils.cc index efe886c29d23b..70e7d5d1cf2fc 100644 --- a/src/relay/transforms/infer_layout_utils.cc +++ b/src/relay/transforms/infer_layout_utils.cc @@ -42,9 +42,9 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo // 2) Find the Index of this dual axis in old_layout. // 3) Find the shape of the that axis in old_shape. // 4) a) Adjust factor to 1, if that shape is 1. b) Else retain the factor. 
- DLOG(INFO) << "AdjustSubordinateFactors" - << "src_layout: " << src_layout << " old_layout: " << old_layout - << " old_shape: " << old_shape << std::endl; + VLOG(1) << "AdjustSubordinateFactors" + << "src_layout: " << src_layout << " old_layout: " << old_layout + << " old_shape: " << old_shape << std::endl; std::string new_layout; for (auto axis : src_layout->axes) { if (!LayoutAxis::Get(axis).IsPrimal()) { @@ -85,8 +85,8 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo } bool Isomorphic(const Layout& lhs, const Layout& rhs) { - DLOG(INFO) << "Isomorphic: " - << "lhs: " << lhs << " rhs: " << rhs << std::endl; + VLOG(1) << "Isomorphic: " + << "lhs: " << lhs << " rhs: " << rhs << std::endl; ICHECK(lhs.defined()); ICHECK(rhs.defined()); if (lhs->axes.size() != rhs->axes.size()) return false; @@ -115,8 +115,8 @@ bool Isomorphic(const Layout& lhs, const Layout& rhs) { } Layout TryTransformLike(const Layout& old, const Layout& ref_old, const Layout& ref_new) { - DLOG(INFO) << "transform_layout: old = " << old << ", ref_new = " << ref_new - << ", ref_old = " << ref_old << std::endl; + VLOG(1) << "transform_layout: old = " << old << ", ref_new = " << ref_new + << ", ref_old = " << ref_old << std::endl; ICHECK(ref_old.defined()); ICHECK(ref_new.defined()); ICHECK(old.defined()); @@ -181,7 +181,7 @@ Layout TryTransformLike(const Layout& old, const Layout& ref_old, const Layout& } } - DLOG(INFO) << "new_layout = " << new_layout << std::endl; + VLOG(1) << "new_layout = " << new_layout << std::endl; return Layout(new_layout); } diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index 012b3579494f1..90b916b5471ed 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -218,8 +218,13 @@ IRModule Inline(const IRModule& module) { namespace transform { Pass Inline() { - runtime::TypedPackedFunc pass_func = - [=](IRModule m, PassContext pc) { return relay::Inline(m); }; + 
runtime::TypedPackedFunc pass_func = [=](IRModule mod, + PassContext pc) { + VLOG(1) << "Inline input:" << std::endl << PrettyPrint(mod); + IRModule out_mod = relay::Inline(mod); + VLOG(1) << "Inline result:" << std::endl << PrettyPrint(out_mod); + return out_mod; + }; return CreateModulePass(pass_func, 1, "InlineGlobals", {}); } diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 5b584e199dc73..13dc689f6f135 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -79,6 +79,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { for (auto field : tuple_node->fields) { auto new_field = Mutate(field); if (const auto* op = new_field.as()) { + // TODO(mbs): Replace with support::IsSimpleScalar? DataType dtype(op->data->dtype); bool is_simple_const = (dtype == DataType::Int(32) || dtype == DataType::Int(64) || dtype == DataType::Float(32) || dtype == DataType::Float(64) || diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index bc1ed518d4736..bbe03c6421ad3 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -331,6 +331,12 @@ class Partitioner : public MixedModeMutator { global_region_func = WithAttr(std::move(global_region_func), attr::kPrimitive, tvm::Integer(1)); global_region_func = WithAttr(std::move(global_region_func), attr::kCompiler, tvm::runtime::String(target)); + // TODO(mbs): The partitioned functions are tagged as Inline=1 so that they can be collapsed + // back into the main relay function and thus pass through the keyhole of the + // GraphExecutorCodegen and AOTExecutorCodegen's 'codegen' method, only to then be outlined + // again. Ideally codegen would be IRModule at a time instead of function at a time, but + // the assumption of IRModule == single 'main' function is now so deeply engrained I it would + // be tricky to make that change. 
global_region_func = WithAttr(std::move(global_region_func), attr::kInline, tvm::Integer(1)); std::string fname = name; diff --git a/src/relay/transforms/simplify_inference.cc b/src/relay/transforms/simplify_inference.cc index e7eef41e41c4b..04383d33cfeb2 100644 --- a/src/relay/transforms/simplify_inference.cc +++ b/src/relay/transforms/simplify_inference.cc @@ -204,7 +204,7 @@ class InferenceSimplifier : public MixedModeMutator { return new_e; } - Expr Rewrite_(const CallNode* n, const Expr& new_n) { + Expr Rewrite_(const CallNode* n, const Expr& new_n) final { if (n->op == batch_norm_op_) { ty_map_[new_n.as()->args[0]] = n->args[0]->checked_type(); } else if (n->op == layer_norm_op_) { diff --git a/src/runtime/const_loader_module.cc b/src/runtime/const_loader_module.cc index 2e91d26d5f965..520cc8342bad2 100644 --- a/src/runtime/const_loader_module.cc +++ b/src/runtime/const_loader_module.cc @@ -55,11 +55,16 @@ class ConstLoaderModuleNode : public ModuleNode { // symbol lookup for initialization. Otherwise, symbols/primitives in the // DSO module will also be cached but they never need to be initialized. for (const auto& it : const_vars_by_symbol_) { + for (const auto& s : it.second) { + VLOG(1) << "ConstLoaderModuleNode has constant symbol '" << s << "' for function '" + << it.first << "'"; + } initialized_[it.first] = false; } } PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { + VLOG(1) << "ConstLoaderModuleNode::GetFunction(" << name << ")"; // Initialize and memoize the module. // Usually, we have some warmup runs. The module initialization should be // done at this stage. Therefore, runtime overhead is not a concern. 
diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index 3971081bf8f8a..cd46967e532b7 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -67,7 +67,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Transpose(TensorRTOpConverterParams* par // Batch dimension cannot be modified. ICHECK_EQ(input->getDimensions().nbDims, order.size() - 1); ICHECK_EQ(order[0], 0); - for (size_t i = 0; i < order.size(); ++i) { + for (size_t i = 0; i + 1 < order.size(); ++i) { perm.order[i] = order[i + 1] - 1; } } else { @@ -880,7 +880,7 @@ class ConcatOpConverter : public TensorRTOpConverter { const int input_rank = params->inputs[0].tensor->getDimensions().nbDims; std::vector input_tensors; for (auto input : params->inputs) { - ICHECK(input.type == kTensor); + ICHECK_EQ(input.type, kTensor); ICHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); input_tensors.push_back(input.tensor); } diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b4d7b41b7f4ae..fd8e99d2c9997 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -118,20 +118,22 @@ class CUDADeviceAPI final : public DeviceAPI { CUDA_CALL(cudaSetDevice(dev.device_id)); size_t free_mem, total_mem; CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); - VLOG(1) << "allocating " << nbytes << " bytes on device, with " << free_mem - << " bytes currently free out of " << total_mem << " bytes available"; + VLOG(1) << "allocating " << nbytes << " bytes on device " << dev.device_id << " with " + << free_mem << " bytes currently free out of " << total_mem << " bytes available"; CUDA_CALL(cudaMalloc(&ret, nbytes)); } + VLOG(1) << "allocated at " << std::hex << reinterpret_cast(ret); return ret; } void FreeDataSpace(Device dev, void* ptr) final { if (dev.device_type == kDLCUDAHost) { - VLOG(1) << "freeing host memory"; + VLOG(1) << "freeing host memory at 
" << std::hex << reinterpret_cast(ptr); CUDA_CALL(cudaFreeHost(ptr)); } else { CUDA_CALL(cudaSetDevice(dev.device_id)); - VLOG(1) << "freeing device memory"; + VLOG(1) << "freeing device " << dev.device_id << " memory at " << std::hex + << reinterpret_cast(ptr); CUDA_CALL(cudaFree(ptr)); } } diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index e5f236983a735..743d1b900ea97 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -52,6 +52,8 @@ class PooledAllocator final : public Allocator { auto&& pool = it->second; auto ret = pool.back(); pool.pop_back(); + VLOG(1) << "reusing buffer of " << ret.size << " bytes at " << std::hex + << reinterpret_cast(ret.data); return ret; } Buffer buf; @@ -67,7 +69,9 @@ class PooledAllocator final : public Allocator { } used_memory_.fetch_add(size, std::memory_order_relaxed); - VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + VLOG(1) << "allocated " << size << " bytes at " << std::hex + << reinterpret_cast(buf.data) << std::dec << ", total used memory is now " + << used_memory_ << " bytes"; return buf; } @@ -77,7 +81,8 @@ class PooledAllocator final : public Allocator { memory_pool_.emplace(buffer.size, std::vector{}); } memory_pool_.at(buffer.size).push_back(buffer); - VLOG(1) << "reclaim buffer " << buffer.size; + VLOG(1) << "reclaiming buffer of " << buffer.size << " bytes at " << std::hex + << reinterpret_cast(buffer.data); } size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } @@ -88,12 +93,14 @@ class PooledAllocator final : public Allocator { for (auto const& it : memory_pool_) { auto const& pool = it.second; for (auto const& buf : pool) { + VLOG(1) << "freeing " << buf.size << " bytes at " << std::hex + << reinterpret_cast(buf.data); DeviceAPI::Get(buf.device)->FreeDataSpace(buf.device, buf.data); } } memory_pool_.clear(); used_memory_ = 0; - VLOG(1) << "release all buffers"; + VLOG(1) 
<< "released all buffers"; } private: diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 8d03dbf210c33..43c2b3ddb0573 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -264,6 +264,7 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag, } else { LOG(FATAL) << "The type of input tensor tag (" << tag.type_code() << ") doesn't match integer or string"; + inp_index = 0; } ICHECK_LT(inp_index, params_num); @@ -359,11 +360,11 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - DLOG(INFO) << "Executing Function: " << std::endl << func; + VLOG(1) << "Executing Function: " << std::endl << func; for (int i = 0; i < static_cast(devices_.size()); ++i) { - DLOG(INFO) << "Device " << i << " has device type " << devices_[i].device_type - << " and device id " << devices_[i].device_id - << (i == exec_->host_device_index ? " (using as host device)" : ""); + VLOG(1) << "Device " << i << " has device type " << devices_[i].device_type << " and device id " + << devices_[i].device_id + << (i == exec_->host_device_index ? 
" (using as host device)" : ""); } InvokeGlobal(func, args); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 1148013706ab7..a0b3c6fcfc1de 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -416,6 +416,9 @@ TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU) // line break TVM_REGISTER_TARGET_KIND("composite", kDLCPU) // line break .add_attr_option>("devices"); +TVM_REGISTER_TARGET_KIND("test_external_cpu_target", kDLCPU) // line break + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + /********** Registry **********/ TVM_REGISTER_GLOBAL("target.TargetKindGetAttr") diff --git a/tests/cpp/relay/collage/partition_rule_test.cc b/tests/cpp/relay/collage/partition_rule_test.cc new file mode 100644 index 0000000000000..4e55359993adf --- /dev/null +++ b/tests/cpp/relay/collage/partition_rule_test.cc @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "../../../src/relay/collage/partition_rule.h" + +#include +#include +#include +#include + +#include "../../../src/relay/collage/partition_spec.h" + +namespace tvm { +namespace relay { +namespace { + +IRModule TestIRModule() { + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); // 3 + %1 = nn.relu(%0); // 4 + nn.relu(%1) // 5 + } + )"; + return parser::ParseModule("string", kModel); +} + +std::vector MakeCandidates( + const collage::DataflowGraph& graph, const runtime::String rule_name, + const collage::PartitionSpec& spec, const std::vector> index_sets) { + std::vector candidate_partitions; + for (const auto& indexes : index_sets) { + auto subgraph = collage::SubGraph(graph, collage::IndexSet(graph.size(), indexes)); + auto candidate = collage::CandidatePartition(rule_name, subgraph, spec); + candidate_partitions.emplace_back(std::move(candidate)); + } + return candidate_partitions; +} + +TEST(PartitionRule, DFPatternSingleOp) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto expected_candidates = MakeCandidates(graph, "relu_pattern", spec, {{4}, {5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), 2); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, DFPatternOverlap) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto pattern = + 
IsOp("nn.relu")({IsOp("nn.relu")({IsWildcard()}) || IsOp("abs")({IsWildcard()})}); + auto rule = collage::DFPatternPartitionRule("relu+abs_pattern", pattern); + auto expected_candidates = MakeCandidates(graph, "relu+abs_pattern", spec, {{3, 4}, {4, 5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), 2); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, Composite) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto composite_rule = collage::CompositePartitionRule("composite", df_rule); + + auto candidates = composite_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TEST(PartitionRule, PrimitiveTVM) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + constexpr const char* kExpectedMod = 
R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = collage::PrimitivePartitionRule("primitive", df_rule); + + auto candidates = primitive_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TVM_REGISTER_TARGET_KIND("test_ext_codegen", kDLCUDA) + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + +TEST(PartitionRule, PrimitiveExternal) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("test_ext_codegen"); + auto spec = collage::PartitionSpec("test_ext_codegen", target, {}); + + { + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + auto expected_expr = + Downcast(parser::ParseModule("string", kExpectedMod)->Lookup("main")); + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = collage::DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = collage::PrimitivePartitionRule("primitive", 
df_rule); + + auto candidates = primitive_rule->AllCandidates(graph, spec); + auto rewrite_expr = collage::CandidatePartition::ParallelRewrite(graph, main, candidates); + + ICHECK_EQ(candidates.size(), 2); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TEST(PartitionRule, Union) { + IRModule ir_mod = TestIRModule(); + auto main = Downcast(ir_mod->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto abs_pattern = IsOp("abs")({IsWildcard()}); + auto abs_rule = collage::DFPatternPartitionRule("abs_pattern", abs_pattern); + auto relu_pattern = IsOp("nn.relu")({IsWildcard()}); + auto relu_rule = collage::DFPatternPartitionRule("relu_pattern", relu_pattern); + auto union_rule = collage::UnionPartitionRule("union", {abs_rule, relu_rule}); + + auto abs_candidates = MakeCandidates(graph, "abs_pattern", spec, {{3}}); + auto relu_candidates = MakeCandidates(graph, "relu_pattern", spec, {{4}, {5}}); + + std::vector expected_candidates; + expected_candidates.insert(expected_candidates.end(), abs_candidates.begin(), + abs_candidates.end()); + expected_candidates.insert(expected_candidates.end(), relu_candidates.begin(), + relu_candidates.end()); + + auto candidates = union_rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), expected_candidates.size()); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +TEST(PartitionRule, OpCallByKind) { + constexpr const char* kMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); // 4 + %1 = add(%0, %x); // 5 + shape_of(%1) // 6 + } + )"; + auto main = Downcast(parser::ParseModule("string", kMod)->Lookup("main")); + auto graph = collage::DataflowGraph(main); + Target target("llvm"); + auto spec = collage::PartitionSpec("test_spec", target, {}); + + { + auto rule 
= collage::OpCallByKindPartitionRule("op_call_by_kind"); + auto expected_candidates = MakeCandidates(graph, "op_call_by_kind", spec, {{4}, {5}}); + + auto candidates = rule->AllCandidates(graph, spec); + + ICHECK_EQ(candidates.size(), expected_candidates.size()); + for (size_t i = 0; i < candidates.size(); i++) { + ICHECK(collage::CandidatePartitionEquals()(candidates[i], expected_candidates[i])); + } + } +} + +} // namespace +} // namespace relay +} // namespace tvm diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes index 1cdb78e31913c..cbc2043cfa4ed 100644 --- a/tests/lint/rat-excludes +++ b/tests/lint/rat-excludes @@ -19,6 +19,7 @@ .*\.log .*\.interp .*\.tokens +.*\.tuninglog # microTVM test data files testdata diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index cecb64785a49a..38da305f3b172 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -18,7 +18,7 @@ import numpy as np import pytest import itertools - +import logging import tvm import tvm.relay.testing @@ -33,12 +33,14 @@ from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt - SUPPORTED_DTYPES = ["float16", "float32"] has_tensorrt_codegen = pytest.mark.skipif( - not tvm.get_global_func("relay.ext.tensorrt", True), reason="TensorRT codegen not available" + not tensorrt.is_tensorrt_compiler_enabled(), reason="TensorRT codegen not available" ) + +# CAUTION: Currently always false in CI since adds tens of minutes to test time and depends +# on TensorRT installation. 
See https://github.com/apache/tvm/issues/11765 has_tensorrt_runtime = pytest.mark.skipif( not tensorrt.is_tensorrt_runtime_enabled(), reason="TensorRT runtime not available" ) @@ -72,7 +74,7 @@ def assert_result_dict_holds(result_dict, dtype="float16"): tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=5e-3) -def set_func_attr(func, compile_name, symbol_name): +def set_outer_func_attr(func, compile_name, symbol_name): func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Compiler", compile_name) @@ -80,6 +82,12 @@ def set_func_attr(func, compile_name, symbol_name): return func +def set_inner_func_attr(func, pattern_name, composite_name): + func = func.with_attr("PartitionedFromPattern", pattern_name) + func = func.with_attr("Composite", composite_name) + return func + + def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. 
@@ -110,34 +118,31 @@ def run_and_verify_func(config, target="cuda", run_module=True, data_type="float result_dict = dict() for mode in ["vm", "graph"]: - for mode in ["graph"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod = relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_fp16=data_type == "float16" - ) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - else: - mod = relay.transform.InferType()(mod) - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + use_fp16 = data_type == "float16" + trt_target = tvm.target.Target(f"tensorrt -use_fp16={use_fp16}") + mod = relay.transform.InferType()(mod) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=[target, trt_target] + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() - if run_module: - result_dict[result_key] = func(**input_dict, **params) + if run_module: + result_dict[result_key] = func(**input_dict, **params) - if run_module: - assert_result_dict_holds(result_dict, data_type) + if run_module: + assert_result_dict_holds(result_dict, data_type) def test_tensorrt_simple(run_module): @@ -163,10 +168,8 @@ def test_tensorrt_simple(run_module): result_key = mode + ("_trt" if use_trt else "") if use_trt: mod = 
relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + mod = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -212,9 +215,9 @@ def test_tensorrt_not_compatible(run_module): f = relay.Function([x], out) mod = tvm.IRModule() mod["main"] = f - mod, config = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) for mode in ["graph", "vm"]: - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -622,26 +625,18 @@ def are_ops_on_graph(self, subgraph) -> bool: def are_ops_on_trt(mod, op_list): + op_on_trt = False + op_on_tvm = False for subgraph in mod.get_global_vars(): name = subgraph.name_hint - op_on_trt = False - op_on_tvm = True - if name == "main": - op_on_tvm = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - elif mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": - op_on_trt = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + if mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": + op_on_trt |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) else: - op_on_tvm &= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - - if not op_on_trt or op_on_tvm: - return False + op_on_tvm |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - return True + return op_on_trt and not op_on_tvm -@pytest.mark.xfail( - reason=("Currently failing test. 
See tracking issue https://github.com/apache/tvm/issues/8901") -) def test_dynamic_reshape(run_module): def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): result_arr = [{} for _ in range(len(x_data_list))] @@ -652,9 +647,9 @@ def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt( - mod, params={}, remove_no_mac_subgraphs=False - ) + logging.info("Before partitioning:\n%s", mod) + mod = tensorrt.partition_for_tensorrt(mod) + logging.info("After partitioning:\n%s", mod) assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt if run_module: with relay.build_config(opt_level=3): @@ -1051,6 +1046,7 @@ def get_graph(d_type="float16"): run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type) +@pytest.mark.skip(reason=("Fails assert_allclose. See https://github.com/apache/tvm/issues/11765")) def test_conv3d(run_module): def get_graph( x_shape=(1, 24, 8, 8, 8), @@ -1143,11 +1139,6 @@ def get_graph( ) -@pytest.mark.xfail( - reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") -) -@has_tensorrt_codegen -@tvm.testing.requires_cuda def test_dynamic_offload(): """ This test checks for proper dynamic offloading of relay graphs. 
An addition between @@ -1161,24 +1152,29 @@ def test_dynamic_offload(): x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32") y = relay.var("y", shape=(data_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + kernel = relay.const(np.random.rand(*k_shape).astype("float32")) def get_expected(): # Create a nested TRT function that matches the expected output mod = tvm.IRModule() - var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") - kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32") - out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4]) - f1 = GlobalVar("tvmgen_default_tensorrt_0") - func = relay.Function([var1, kernel_trt], out1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") - mod[f1] = func + outer_var = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") + inner_var = relay.var("FunctionVar_0_0", shape=(data_shape), dtype="float32") + inner_body = relay.nn.conv2d( + inner_var, kernel, channels=k_shape[0], kernel_size=k_shape[2:4] + ) + inner_func = relay.Function([inner_var], inner_body) + inner_func = set_inner_func_attr(inner_func, "nn.conv2d_", "tensorrt.nn.conv2d") + outer_body = inner_func(outer_var) + outer_func = relay.Function([outer_var], outer_body) + outer_func = set_outer_func_attr(outer_func, "tensorrt", "tvmgen_default_tensorrt_main_0") + gv = GlobalVar("tvmgen_default_tensorrt_main_0") + mod[gv] = outer_func mod = relay.transform.InferType()(mod) # Create the main function out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) - out = relay.add(out1, f1(y, kernel)) - f = relay.Function([x, y, kernel], out) + out = relay.add(out1, gv(y)) + f = relay.Function([x, y], out) mod["main"] = f mod = relay.transform.InferType()(mod) return mod @@ -1187,13 +1183,13 @@ def get_expected(): out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], 
kernel_size=k_shape[2:4]) out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) out = relay.add(out1, out2) - f = relay.Function([x, y, kernel], out) + f = relay.Function([x, y], out) # Pass the function to TRT compilation mod = tvm.IRModule() mod["main"] = f mod = relay.transform.InferType()(mod) - mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={}) + mod_trt = tensorrt.partition_for_tensorrt(mod) # Get the expected relay graph and compare mod_exp = get_expected() @@ -1212,7 +1208,7 @@ def test_tensorrt_dynamic_batch(run_module): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) if run_module: with relay.build_config(opt_level=3): @@ -1242,17 +1238,17 @@ def test_tensorrt_dynamic_batch_conv(run_module): f = relay.Function([x, kernel], out) mod = tvm.IRModule() mod["main"] = f + trt_target = tvm.target.Target(f"tensorrt -use_implicit_batch={use_implicit_batch}") if use_trt: - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_implicit_batch=use_implicit_batch - ) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) if run_module: for target in ["llvm", "cuda"]: - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + targets = [target] + if use_trt: + targets.append(trt_target) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( - "vm", mod=mod, device=tvm.device(target), target=target + "vm", mod=mod, device=tvm.device(target), target=targets ).evaluate() for i, batch_size in enumerate(batches_to_test): result_arr[i][target][use_trt] = func(x_data[:batch_size, ...], **params) @@ -1262,6 +1258,11 @@ def test_tensorrt_dynamic_batch_conv(run_module): assert_result_dict_holds(result_arr[i][target]) +@pytest.mark.skip( + reason=( + "Coredumps, possibly due to LLVM and PyTorch version mismatch. 
See https://github.com/apache/tvm/issues/11765" + ) +) def test_maskrcnn_resnet50(run_module) -> None: """ This function tests the working of pytorch maskrcnn with resnet50 as backbone with @@ -1281,9 +1282,11 @@ def convert_traced_model_to_vm_trt( input_name = "input0" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(traced_module, shape_list) - mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True) + trt_target = tvm.target.Target("tensorrt -remove_no_mac_subgraphs=True") + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + targets = [target, trt_target] with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]): - vm_trt_exec = relay.vm.compile(mod, target=target, params=params) + vm_trt_exec = relay.vm.compile(mod, target=targets, params=params) return vm_trt_exec @@ -1381,7 +1384,7 @@ def test_empty_subgraph(run_module): var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32") f1 = GlobalVar("tensorrt_0") func = relay.Function([var1], var1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") + func = set_outer_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") mod[f1] = func mod = relay.transform.InferType()(mod) @@ -1402,4 +1405,5 @@ def test_empty_subgraph(run_module): if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) tvm.testing.main() diff --git a/tests/python/contrib/test_tensorrt_int8_exp.py b/tests/python/contrib/test_tensorrt_int8_exp.py index 84360e92d33b9..97eff3af0e42a 100644 --- a/tests/python/contrib/test_tensorrt_int8_exp.py +++ b/tests/python/contrib/test_tensorrt_int8_exp.py @@ -103,8 +103,8 @@ def test_trt_int8(): # compile the model target = "cuda" dev = tvm.cuda(1) - mod, config = partition_for_tensorrt(mod, params) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + mod = partition_for_tensorrt(mod, params) + with 
tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) dtype = "float32" diff --git a/tests/python/relay/collage/menangerie.py b/tests/python/relay/collage/menangerie.py new file mode 100644 index 0000000000000..60f150e11cea8 --- /dev/null +++ b/tests/python/relay/collage/menangerie.py @@ -0,0 +1,4286 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import onnx +import numpy as np +import logging +import tvm.contrib.target.onnx + +MODEL_PREFIX = "/home/mbs/gauntlet/models/" +MNIST = { + "name": "mnist", + "filename": "mnist-8.onnx", + "input_shapes": {"Input3": [1, 1, 28, 28]}, + "input_dtypes": {"Input3": "float32"}, + "main_dtype": "float32", +} +GPT2 = { + "name": "gpt2", + "filename": "gpt2.onnx", + "input_shapes": {"input1": [1, 50, 32]}, + "input_dtypes": {"input1": "int64"}, + "main_dtype": "float32", +} +RESNET50V2 = { + "name": "resnet50", + "filename": "resnet50-v2-7.onnx", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "main_dtype": "float32", +} +MOBILENETV2 = { + "name": "mobilenet", + "filename": "mobilenetv2-1.0.onnx", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "main_dtype": "float32", +} +# Note that resnext50_32_4d below was extracted directly from the pytorch model and not from any onnx file. +RESNEXT50_32_4d = { + "name": "resnext50_32_4d", + "filename": "resnext50_32x4d.onnx", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float32"}, + "main_dtype": "float32", +} + + +def make_const(dtype, shape): + return tvm.relay.const(np.random.rand(*shape).astype(dtype)) + + +def make_consts(dtype, shapes): + return [make_const(dtype, shape) for shape in shapes] + + +def mnist_consts(dtype): + return make_consts( + dtype, + [ + (8, 1, 5, 5), # 0 + (8, 1, 1), # 1 + (16, 8, 5, 5), # 2 + (16, 1, 1), # 3 + (10, 256), # 4 + (1, 10), # 5 + ], + ) + + +def mnist(): + metatable = {"relay.Constant": mnist_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 1, 28, 28), float32]) -> Tensor[(1, 10), float32] { + %0 = nn.pad(%x, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]); + %1 = nn.conv2d(%0, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=8, kernel_size=[5, 5]); + %2 = add(%1, meta[relay.Constant][1]); + %3 = nn.relu(%2); + %4 = 
nn.max_pool2d(%3, pool_size=[2, 2], strides=[2, 2], padding=[0, 0, 0, 0]); + %5 = nn.pad(%4, 0f, pad_width=[[0, 0], [0, 0], [2, 2], [2, 2]]); + %6 = nn.conv2d(%5, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=16, kernel_size=[5, 5]); + %7 = add(%6, meta[relay.Constant][3]); + %8 = nn.relu(%7); + %9 = nn.max_pool2d(%8, pool_size=[3, 3], strides=[3, 3], padding=[0, 0, 0, 0]); + %10 = reshape(%9, newshape=[1, 256]); + %11 = nn.dense(%10, meta[relay.Constant][4], units=None, out_dtype="float32"); + add(%11, meta[relay.Constant][5]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mnist", + "input_shapes": {"x": [1, 1, 28, 28]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_consts(dtype): + return make_consts( + dtype, + [ + (50257, 768), # 0 + (1, 32, 768), # 1 + (768,), # 2 + (768,), # 3 + (2304, 768), # 4 + (2304,), # 5 + (1, 1, 32, 32), # 6 + (1, 1, 32, 32), # 7 + (768, 768), # 8 + (768,), # 9 + (768,), # 10 + (768,), # 11 + (3072, 768), # 12 + (3072,), # 13 + (768, 3072), # 14 + (768,), # 15 + (768,), # 16 + (768,), # 17 + (2304, 768), # 18 + (2304,), # 19 + (1, 1, 32, 32), # 20 + (1, 1, 32, 32), # 21 + (768, 768), # 22 + (768,), # 23 + (768,), # 24 + (768,), # 25 + (3072, 768), # 26 + (3072,), # 27 + (768, 3072), # 28 + (768,), # 29 + (768,), # 30 + (768,), # 31 + (2304, 768), # 32 + (2304,), # 33 + (1, 1, 32, 32), # 34 + (1, 1, 32, 32), # 35 + (768, 768), # 36 + (768,), # 37 + (768,), # 38 + (768,), # 39 + (3072, 768), # 40 + (3072,), # 41 + (768, 3072), # 42 + (768,), # 43 + (768,), # 44 + (768,), # 45 + (2304, 768), # 46 + (2304,), # 47 + (1, 1, 32, 32), # 48 + (1, 1, 32, 32), # 49 + (768, 768), # 50 + (768,), # 51 + (768,), # 52 + (768,), # 53 + (3072, 768), # 54 + (3072,), # 55 + (768, 3072), # 56 + (768,), # 57 + (768,), # 58 + (768,), # 59 + (2304, 768), # 60 + (2304,), # 61 + (1, 1, 32, 32), # 62 + (1, 1, 32, 32), # 63 + (768, 768), # 64 + (768,), # 65 + 
(768,), # 66 + (768,), # 67 + (3072, 768), # 68 + (3072,), # 69 + (768, 3072), # 70 + (768,), # 71 + (768,), # 72 + (768,), # 73 + (2304, 768), # 74 + (2304,), # 75 + (1, 1, 32, 32), # 76 + (1, 1, 32, 32), # 77 + (768, 768), # 78 + (768,), # 79 + (768,), # 80 + (768,), # 81 + (3072, 768), # 82 + (3072,), # 83 + (768, 3072), # 84 + (768,), # 85 + (768,), # 86 + (768,), # 87 + (2304, 768), # 88 + (2304,), # 89 + (1, 1, 32, 32), # 90 + (1, 1, 32, 32), # 91 + (768, 768), # 92 + (768,), # 93 + (768,), # 94 + (768,), # 95 + (3072, 768), # 96 + (3072,), # 97 + (768, 3072), # 98 + (768,), # 99 + (768,), # 100 + (768,), # 101 + (2304, 768), # 102 + (2304,), # 103 + (1, 1, 32, 32), # 104 + (1, 1, 32, 32), # 105 + (768, 768), # 106 + (768,), # 107 + (768,), # 108 + (768,), # 109 + (3072, 768), # 110 + (3072,), # 111 + (768, 3072), # 112 + (768,), # 113 + (768,), # 114 + (768,), # 115 + (2304, 768), # 116 + (2304,), # 117 + (1, 1, 32, 32), # 118 + (1, 1, 32, 32), # 119 + (768, 768), # 120 + (768,), # 121 + (768,), # 122 + (768,), # 123 + (3072, 768), # 124 + (3072,), # 125 + (768, 3072), # 126 + (768,), # 127 + (768,), # 128 + (768,), # 129 + (2304, 768), # 130 + (2304,), # 131 + (1, 1, 32, 32), # 132 + (1, 1, 32, 32), # 133 + (768, 768), # 134 + (768,), # 135 + (768,), # 136 + (768,), # 137 + (3072, 768), # 138 + (3072,), # 139 + (768, 3072), # 140 + (768,), # 141 + (768,), # 142 + (768,), # 143 + (2304, 768), # 144 + (2304,), # 145 + (1, 1, 32, 32), # 146 + (1, 1, 32, 32), # 147 + (768, 768), # 148 + (768,), # 149 + (768,), # 150 + (768,), # 151 + (3072, 768), # 152 + (3072,), # 153 + (768, 3072), # 154 + (768,), # 155 + (768,), # 156 + (768,), # 157 + (2304, 768), # 158 + (2304,), # 159 + (1, 1, 32, 32), # 160 + (1, 1, 32, 32), # 161 + (768, 768), # 162 + (768,), # 163 + (768,), # 164 + (768,), # 165 + (3072, 768), # 166 + (3072,), # 167 + (768, 3072), # 168 + (768,), # 169 + (768,), # 170 + (768,), # 171 + ], + ) + + +def gpt2(): + metatable = {"relay.Constant": 
gpt2_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32], + Tensor[(2, 50, 12, 32, 64), float32]) { + %0 = reshape(%x, newshape=[-1, 32]); + %1 = less(%0, 0i64); + %2 = add(%0, 50257i64); + %3 = where(%1, %2, %0); + %4 = take(meta[relay.Constant][0], %3, axis=0); + %5 = add(%4, meta[relay.Constant][1]); + %6 = mean(%5, axis=[-1], keepdims=True); + %7 = subtract(%5, %6); + %8 = power(%7, 2f); + %9 = mean(%8, axis=[-1], keepdims=True); + %10 = add(%9, 1e-05f); + %11 = sqrt(%10); + %12 = divide(%7, %11); + %13 = multiply(%12, meta[relay.Constant][2]); + %14 = add(%13, meta[relay.Constant][3]); + %15 = reshape(%14, newshape=[-1, 768]); + %16 = nn.dense(%15, meta[relay.Constant][4], units=2304); + %17 = add(%16, meta[relay.Constant][5]); + %18 = reshape(%17, newshape=[50, 32, 2304]); + %19 = split(%18, indices_or_sections=[768, 1536], axis=2); + %20 = %19.0; + %21 = reshape(%20, newshape=[50, 32, 12, 64]); + %22 = transpose(%21, axes=[0, 2, 1, 3]); + %23 = %19.1; + %24 = reshape(%23, newshape=[50, 32, 12, 64]); + %25 = transpose(%24, axes=[0, 2, 3, 1]); + %26 = reshape(%25, newshape=[-1, 64, 32]); + %27 = reshape(%22, newshape=[-1, 32, 64]); + %28 = transpose(%26, axes=[0, 2, 1]); + %29 = nn.batch_matmul(%27, %28, out_dtype="float32", transpose_b=True); + %30 = reshape(%29, newshape=[50, 12, 32, 32]); + %31 = divide(%30, 8f); + %32 = multiply(%31, meta[relay.Constant][6]); + %33 = subtract(%32, meta[relay.Constant][7]); + %34 = 
nn.softmax(%33, axis=3); + %35 = %19.2; + %36 = reshape(%35, newshape=[50, 32, 12, 64]); + %37 = transpose(%36, axes=[0, 2, 1, 3]); + %38 = reshape(%37, newshape=[-1, 32, 64]); + %39 = reshape(%34, newshape=[-1, 32, 32]); + %40 = transpose(%38, axes=[0, 2, 1]); + %41 = nn.batch_matmul(%39, %40, out_dtype="float32", transpose_b=True); + %42 = reshape(%41, newshape=[50, 12, 32, 64]); + %43 = transpose(%42, axes=[0, 2, 1, 3]); + %44 = reshape(%43, newshape=[50, 32, 768]); + %45 = reshape(%44, newshape=[-1, 768]); + %46 = nn.dense(%45, meta[relay.Constant][8], units=768); + %47 = add(%46, meta[relay.Constant][9]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(%5, %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][10]); + %58 = add(%57, meta[relay.Constant][11]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][12], units=3072); + %61 = add(%60, meta[relay.Constant][13]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f); + %64 = multiply(%63, 0.044715f); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f); + %69 = add(%67, 1f); + %70 = multiply(%68, %69); + %71 = reshape(%70, newshape=[-1, 3072]); + %72 = nn.dense(%71, meta[relay.Constant][14], units=768); + %73 = add(%72, meta[relay.Constant][15]); + %74 = reshape(%73, newshape=[50, 32, 768]); + %75 = add(%49, %74); + %76 = mean(%75, axis=[-1], keepdims=True); + %77 = subtract(%75, %76); + %78 = power(%77, 2f); + %79 = mean(%78, axis=[-1], keepdims=True); + %80 = add(%79, 1e-05f); + %81 = sqrt(%80); + %82 = divide(%77, %81); + %83 = multiply(%82, meta[relay.Constant][16]); + %84 = add(%83, meta[relay.Constant][17]); + %85 = reshape(%84, newshape=[-1, 768]); + %86 = nn.dense(%85, 
meta[relay.Constant][18], units=2304); + %87 = add(%86, meta[relay.Constant][19]); + %88 = reshape(%87, newshape=[50, 32, 2304]); + %89 = split(%88, indices_or_sections=[768, 1536], axis=2); + %90 = %89.0; + %91 = reshape(%90, newshape=[50, 32, 12, 64]); + %92 = transpose(%91, axes=[0, 2, 1, 3]); + %93 = %89.1; + %94 = reshape(%93, newshape=[50, 32, 12, 64]); + %95 = transpose(%94, axes=[0, 2, 3, 1]); + %96 = reshape(%95, newshape=[-1, 64, 32]); + %97 = reshape(%92, newshape=[-1, 32, 64]); + %98 = transpose(%96, axes=[0, 2, 1]); + %99 = nn.batch_matmul(%97, %98, out_dtype="float32", transpose_b=True); + %100 = reshape(%99, newshape=[50, 12, 32, 32]); + %101 = divide(%100, 8f); + %102 = multiply(%101, meta[relay.Constant][20]); + %103 = subtract(%102, meta[relay.Constant][21]); + %104 = nn.softmax(%103, axis=3); + %105 = %89.2; + %106 = reshape(%105, newshape=[50, 32, 12, 64]); + %107 = transpose(%106, axes=[0, 2, 1, 3]); + %108 = reshape(%107, newshape=[-1, 32, 64]); + %109 = reshape(%104, newshape=[-1, 32, 32]); + %110 = transpose(%108, axes=[0, 2, 1]); + %111 = nn.batch_matmul(%109, %110, out_dtype="float32", transpose_b=True); + %112 = reshape(%111, newshape=[50, 12, 32, 64]); + %113 = transpose(%112, axes=[0, 2, 1, 3]); + %114 = reshape(%113, newshape=[50, 32, 768]); + %115 = reshape(%114, newshape=[-1, 768]); + %116 = nn.dense(%115, meta[relay.Constant][22], units=768); + %117 = add(%116, meta[relay.Constant][23]); + %118 = reshape(%117, newshape=[50, 32, 768]); + %119 = add(%75, %118); + %120 = mean(%119, axis=[-1], keepdims=True); + %121 = subtract(%119, %120); + %122 = power(%121, 2f); + %123 = mean(%122, axis=[-1], keepdims=True); + %124 = add(%123, 1e-05f); + %125 = sqrt(%124); + %126 = divide(%121, %125); + %127 = multiply(%126, meta[relay.Constant][24]); + %128 = add(%127, meta[relay.Constant][25]); + %129 = reshape(%128, newshape=[-1, 768]); + %130 = nn.dense(%129, meta[relay.Constant][26], units=3072); + %131 = add(%130, meta[relay.Constant][27]); + 
%132 = reshape(%131, newshape=[50, 32, 3072]); + %133 = power(%132, 3f); + %134 = multiply(%133, 0.044715f); + %135 = add(%132, %134); + %136 = multiply(%135, 0.797885f); + %137 = tanh(%136); + %138 = multiply(%132, 0.5f); + %139 = add(%137, 1f); + %140 = multiply(%138, %139); + %141 = reshape(%140, newshape=[-1, 3072]); + %142 = nn.dense(%141, meta[relay.Constant][28], units=768); + %143 = add(%142, meta[relay.Constant][29]); + %144 = reshape(%143, newshape=[50, 32, 768]); + %145 = add(%119, %144); + %146 = mean(%145, axis=[-1], keepdims=True); + %147 = subtract(%145, %146); + %148 = power(%147, 2f); + %149 = mean(%148, axis=[-1], keepdims=True); + %150 = add(%149, 1e-05f); + %151 = sqrt(%150); + %152 = divide(%147, %151); + %153 = multiply(%152, meta[relay.Constant][30]); + %154 = add(%153, meta[relay.Constant][31]); + %155 = reshape(%154, newshape=[-1, 768]); + %156 = nn.dense(%155, meta[relay.Constant][32], units=2304); + %157 = add(%156, meta[relay.Constant][33]); + %158 = reshape(%157, newshape=[50, 32, 2304]); + %159 = split(%158, indices_or_sections=[768, 1536], axis=2); + %160 = %159.0; + %161 = reshape(%160, newshape=[50, 32, 12, 64]); + %162 = transpose(%161, axes=[0, 2, 1, 3]); + %163 = %159.1; + %164 = reshape(%163, newshape=[50, 32, 12, 64]); + %165 = transpose(%164, axes=[0, 2, 3, 1]); + %166 = reshape(%165, newshape=[-1, 64, 32]); + %167 = reshape(%162, newshape=[-1, 32, 64]); + %168 = transpose(%166, axes=[0, 2, 1]); + %169 = nn.batch_matmul(%167, %168, out_dtype="float32", transpose_b=True); + %170 = reshape(%169, newshape=[50, 12, 32, 32]); + %171 = divide(%170, 8f); + %172 = multiply(%171, meta[relay.Constant][34]); + %173 = subtract(%172, meta[relay.Constant][35]); + %174 = nn.softmax(%173, axis=3); + %175 = %159.2; + %176 = reshape(%175, newshape=[50, 32, 12, 64]); + %177 = transpose(%176, axes=[0, 2, 1, 3]); + %178 = reshape(%177, newshape=[-1, 32, 64]); + %179 = reshape(%174, newshape=[-1, 32, 32]); + %180 = transpose(%178, axes=[0, 2, 1]); 
+ %181 = nn.batch_matmul(%179, %180, out_dtype="float32", transpose_b=True); + %182 = reshape(%181, newshape=[50, 12, 32, 64]); + %183 = transpose(%182, axes=[0, 2, 1, 3]); + %184 = reshape(%183, newshape=[50, 32, 768]); + %185 = reshape(%184, newshape=[-1, 768]); + %186 = nn.dense(%185, meta[relay.Constant][36], units=768); + %187 = add(%186, meta[relay.Constant][37]); + %188 = reshape(%187, newshape=[50, 32, 768]); + %189 = add(%145, %188); + %190 = mean(%189, axis=[-1], keepdims=True); + %191 = subtract(%189, %190); + %192 = power(%191, 2f); + %193 = mean(%192, axis=[-1], keepdims=True); + %194 = add(%193, 1e-05f); + %195 = sqrt(%194); + %196 = divide(%191, %195); + %197 = multiply(%196, meta[relay.Constant][38]); + %198 = add(%197, meta[relay.Constant][39]); + %199 = reshape(%198, newshape=[-1, 768]); + %200 = nn.dense(%199, meta[relay.Constant][40], units=3072); + %201 = add(%200, meta[relay.Constant][41]); + %202 = reshape(%201, newshape=[50, 32, 3072]); + %203 = power(%202, 3f); + %204 = multiply(%203, 0.044715f); + %205 = add(%202, %204); + %206 = multiply(%205, 0.797885f); + %207 = tanh(%206); + %208 = multiply(%202, 0.5f); + %209 = add(%207, 1f); + %210 = multiply(%208, %209); + %211 = reshape(%210, newshape=[-1, 3072]); + %212 = nn.dense(%211, meta[relay.Constant][42], units=768); + %213 = add(%212, meta[relay.Constant][43]); + %214 = reshape(%213, newshape=[50, 32, 768]); + %215 = add(%189, %214); + %216 = mean(%215, axis=[-1], keepdims=True); + %217 = subtract(%215, %216); + %218 = power(%217, 2f); + %219 = mean(%218, axis=[-1], keepdims=True); + %220 = add(%219, 1e-05f); + %221 = sqrt(%220); + %222 = divide(%217, %221); + %223 = multiply(%222, meta[relay.Constant][44]); + %224 = add(%223, meta[relay.Constant][45]); + %225 = reshape(%224, newshape=[-1, 768]); + %226 = nn.dense(%225, meta[relay.Constant][46], units=2304); + %227 = add(%226, meta[relay.Constant][47]); + %228 = reshape(%227, newshape=[50, 32, 2304]); + %229 = split(%228, 
indices_or_sections=[768, 1536], axis=2); + %230 = %229.0; + %231 = reshape(%230, newshape=[50, 32, 12, 64]); + %232 = transpose(%231, axes=[0, 2, 1, 3]); + %233 = %229.1; + %234 = reshape(%233, newshape=[50, 32, 12, 64]); + %235 = transpose(%234, axes=[0, 2, 3, 1]); + %236 = reshape(%235, newshape=[-1, 64, 32]); + %237 = reshape(%232, newshape=[-1, 32, 64]); + %238 = transpose(%236, axes=[0, 2, 1]); + %239 = nn.batch_matmul(%237, %238, out_dtype="float32", transpose_b=True); + %240 = reshape(%239, newshape=[50, 12, 32, 32]); + %241 = divide(%240, 8f); + %242 = multiply(%241, meta[relay.Constant][48]); + %243 = subtract(%242, meta[relay.Constant][49]); + %244 = nn.softmax(%243, axis=3); + %245 = %229.2; + %246 = reshape(%245, newshape=[50, 32, 12, 64]); + %247 = transpose(%246, axes=[0, 2, 1, 3]); + %248 = reshape(%247, newshape=[-1, 32, 64]); + %249 = reshape(%244, newshape=[-1, 32, 32]); + %250 = transpose(%248, axes=[0, 2, 1]); + %251 = nn.batch_matmul(%249, %250, out_dtype="float32", transpose_b=True); + %252 = reshape(%251, newshape=[50, 12, 32, 64]); + %253 = transpose(%252, axes=[0, 2, 1, 3]); + %254 = reshape(%253, newshape=[50, 32, 768]); + %255 = reshape(%254, newshape=[-1, 768]); + %256 = nn.dense(%255, meta[relay.Constant][50], units=768); + %257 = add(%256, meta[relay.Constant][51]); + %258 = reshape(%257, newshape=[50, 32, 768]); + %259 = add(%215, %258); + %260 = mean(%259, axis=[-1], keepdims=True); + %261 = subtract(%259, %260); + %262 = power(%261, 2f); + %263 = mean(%262, axis=[-1], keepdims=True); + %264 = add(%263, 1e-05f); + %265 = sqrt(%264); + %266 = divide(%261, %265); + %267 = multiply(%266, meta[relay.Constant][52]); + %268 = add(%267, meta[relay.Constant][53]); + %269 = reshape(%268, newshape=[-1, 768]); + %270 = nn.dense(%269, meta[relay.Constant][54], units=3072); + %271 = add(%270, meta[relay.Constant][55]); + %272 = reshape(%271, newshape=[50, 32, 3072]); + %273 = power(%272, 3f); + %274 = multiply(%273, 0.044715f); + %275 = 
add(%272, %274); + %276 = multiply(%275, 0.797885f); + %277 = tanh(%276); + %278 = multiply(%272, 0.5f); + %279 = add(%277, 1f); + %280 = multiply(%278, %279); + %281 = reshape(%280, newshape=[-1, 3072]); + %282 = nn.dense(%281, meta[relay.Constant][56], units=768); + %283 = add(%282, meta[relay.Constant][57]); + %284 = reshape(%283, newshape=[50, 32, 768]); + %285 = add(%259, %284); + %286 = mean(%285, axis=[-1], keepdims=True); + %287 = subtract(%285, %286); + %288 = power(%287, 2f); + %289 = mean(%288, axis=[-1], keepdims=True); + %290 = add(%289, 1e-05f); + %291 = sqrt(%290); + %292 = divide(%287, %291); + %293 = multiply(%292, meta[relay.Constant][58]); + %294 = add(%293, meta[relay.Constant][59]); + %295 = reshape(%294, newshape=[-1, 768]); + %296 = nn.dense(%295, meta[relay.Constant][60], units=2304); + %297 = add(%296, meta[relay.Constant][61]); + %298 = reshape(%297, newshape=[50, 32, 2304]); + %299 = split(%298, indices_or_sections=[768, 1536], axis=2); + %300 = %299.0; + %301 = reshape(%300, newshape=[50, 32, 12, 64]); + %302 = transpose(%301, axes=[0, 2, 1, 3]); + %303 = %299.1; + %304 = reshape(%303, newshape=[50, 32, 12, 64]); + %305 = transpose(%304, axes=[0, 2, 3, 1]); + %306 = reshape(%305, newshape=[-1, 64, 32]); + %307 = reshape(%302, newshape=[-1, 32, 64]); + %308 = transpose(%306, axes=[0, 2, 1]); + %309 = nn.batch_matmul(%307, %308, out_dtype="float32", transpose_b=True); + %310 = reshape(%309, newshape=[50, 12, 32, 32]); + %311 = divide(%310, 8f); + %312 = multiply(%311, meta[relay.Constant][62]); + %313 = subtract(%312, meta[relay.Constant][63]); + %314 = nn.softmax(%313, axis=3); + %315 = %299.2; + %316 = reshape(%315, newshape=[50, 32, 12, 64]); + %317 = transpose(%316, axes=[0, 2, 1, 3]); + %318 = reshape(%317, newshape=[-1, 32, 64]); + %319 = reshape(%314, newshape=[-1, 32, 32]); + %320 = transpose(%318, axes=[0, 2, 1]); + %321 = nn.batch_matmul(%319, %320, out_dtype="float32", transpose_b=True); + %322 = reshape(%321, newshape=[50, 12, 
32, 64]); + %323 = transpose(%322, axes=[0, 2, 1, 3]); + %324 = reshape(%323, newshape=[50, 32, 768]); + %325 = reshape(%324, newshape=[-1, 768]); + %326 = nn.dense(%325, meta[relay.Constant][64], units=768); + %327 = add(%326, meta[relay.Constant][65]); + %328 = reshape(%327, newshape=[50, 32, 768]); + %329 = add(%285, %328); + %330 = mean(%329, axis=[-1], keepdims=True); + %331 = subtract(%329, %330); + %332 = power(%331, 2f); + %333 = mean(%332, axis=[-1], keepdims=True); + %334 = add(%333, 1e-05f); + %335 = sqrt(%334); + %336 = divide(%331, %335); + %337 = multiply(%336, meta[relay.Constant][66]); + %338 = add(%337, meta[relay.Constant][67]); + %339 = reshape(%338, newshape=[-1, 768]); + %340 = nn.dense(%339, meta[relay.Constant][68], units=3072); + %341 = add(%340, meta[relay.Constant][69]); + %342 = reshape(%341, newshape=[50, 32, 3072]); + %343 = power(%342, 3f); + %344 = multiply(%343, 0.044715f); + %345 = add(%342, %344); + %346 = multiply(%345, 0.797885f); + %347 = tanh(%346); + %348 = multiply(%342, 0.5f); + %349 = add(%347, 1f); + %350 = multiply(%348, %349); + %351 = reshape(%350, newshape=[-1, 3072]); + %352 = nn.dense(%351, meta[relay.Constant][70], units=768); + %353 = add(%352, meta[relay.Constant][71]); + %354 = reshape(%353, newshape=[50, 32, 768]); + %355 = add(%329, %354); + %356 = mean(%355, axis=[-1], keepdims=True); + %357 = subtract(%355, %356); + %358 = power(%357, 2f); + %359 = mean(%358, axis=[-1], keepdims=True); + %360 = add(%359, 1e-05f); + %361 = sqrt(%360); + %362 = divide(%357, %361); + %363 = multiply(%362, meta[relay.Constant][72]); + %364 = add(%363, meta[relay.Constant][73]); + %365 = reshape(%364, newshape=[-1, 768]); + %366 = nn.dense(%365, meta[relay.Constant][74], units=2304); + %367 = add(%366, meta[relay.Constant][75]); + %368 = reshape(%367, newshape=[50, 32, 2304]); + %369 = split(%368, indices_or_sections=[768, 1536], axis=2); + %370 = %369.0; + %371 = reshape(%370, newshape=[50, 32, 12, 64]); + %372 = transpose(%371, 
axes=[0, 2, 1, 3]); + %373 = %369.1; + %374 = reshape(%373, newshape=[50, 32, 12, 64]); + %375 = transpose(%374, axes=[0, 2, 3, 1]); + %376 = reshape(%375, newshape=[-1, 64, 32]); + %377 = reshape(%372, newshape=[-1, 32, 64]); + %378 = transpose(%376, axes=[0, 2, 1]); + %379 = nn.batch_matmul(%377, %378, out_dtype="float32", transpose_b=True); + %380 = reshape(%379, newshape=[50, 12, 32, 32]); + %381 = divide(%380, 8f); + %382 = multiply(%381, meta[relay.Constant][76]); + %383 = subtract(%382, meta[relay.Constant][77]); + %384 = nn.softmax(%383, axis=3); + %385 = %369.2; + %386 = reshape(%385, newshape=[50, 32, 12, 64]); + %387 = transpose(%386, axes=[0, 2, 1, 3]); + %388 = reshape(%387, newshape=[-1, 32, 64]); + %389 = reshape(%384, newshape=[-1, 32, 32]); + %390 = transpose(%388, axes=[0, 2, 1]); + %391 = nn.batch_matmul(%389, %390, out_dtype="float32", transpose_b=True); + %392 = reshape(%391, newshape=[50, 12, 32, 64]); + %393 = transpose(%392, axes=[0, 2, 1, 3]); + %394 = reshape(%393, newshape=[50, 32, 768]); + %395 = reshape(%394, newshape=[-1, 768]); + %396 = nn.dense(%395, meta[relay.Constant][78], units=768); + %397 = add(%396, meta[relay.Constant][79]); + %398 = reshape(%397, newshape=[50, 32, 768]); + %399 = add(%355, %398); + %400 = mean(%399, axis=[-1], keepdims=True); + %401 = subtract(%399, %400); + %402 = power(%401, 2f); + %403 = mean(%402, axis=[-1], keepdims=True); + %404 = add(%403, 1e-05f); + %405 = sqrt(%404); + %406 = divide(%401, %405); + %407 = multiply(%406, meta[relay.Constant][80]); + %408 = add(%407, meta[relay.Constant][81]); + %409 = reshape(%408, newshape=[-1, 768]); + %410 = nn.dense(%409, meta[relay.Constant][82], units=3072); + %411 = add(%410, meta[relay.Constant][83]); + %412 = reshape(%411, newshape=[50, 32, 3072]); + %413 = power(%412, 3f); + %414 = multiply(%413, 0.044715f); + %415 = add(%412, %414); + %416 = multiply(%415, 0.797885f); + %417 = tanh(%416); + %418 = multiply(%412, 0.5f); + %419 = add(%417, 1f); + %420 = 
multiply(%418, %419); + %421 = reshape(%420, newshape=[-1, 3072]); + %422 = nn.dense(%421, meta[relay.Constant][84], units=768); + %423 = add(%422, meta[relay.Constant][85]); + %424 = reshape(%423, newshape=[50, 32, 768]); + %425 = add(%399, %424); + %426 = mean(%425, axis=[-1], keepdims=True); + %427 = subtract(%425, %426); + %428 = power(%427, 2f); + %429 = mean(%428, axis=[-1], keepdims=True); + %430 = add(%429, 1e-05f); + %431 = sqrt(%430); + %432 = divide(%427, %431); + %433 = multiply(%432, meta[relay.Constant][86]); + %434 = add(%433, meta[relay.Constant][87]); + %435 = reshape(%434, newshape=[-1, 768]); + %436 = nn.dense(%435, meta[relay.Constant][88], units=2304); + %437 = add(%436, meta[relay.Constant][89]); + %438 = reshape(%437, newshape=[50, 32, 2304]); + %439 = split(%438, indices_or_sections=[768, 1536], axis=2); + %440 = %439.0; + %441 = reshape(%440, newshape=[50, 32, 12, 64]); + %442 = transpose(%441, axes=[0, 2, 1, 3]); + %443 = %439.1; + %444 = reshape(%443, newshape=[50, 32, 12, 64]); + %445 = transpose(%444, axes=[0, 2, 3, 1]); + %446 = reshape(%445, newshape=[-1, 64, 32]); + %447 = reshape(%442, newshape=[-1, 32, 64]); + %448 = transpose(%446, axes=[0, 2, 1]); + %449 = nn.batch_matmul(%447, %448, out_dtype="float32", transpose_b=True); + %450 = reshape(%449, newshape=[50, 12, 32, 32]); + %451 = divide(%450, 8f); + %452 = multiply(%451, meta[relay.Constant][90]); + %453 = subtract(%452, meta[relay.Constant][91]); + %454 = nn.softmax(%453, axis=3); + %455 = %439.2; + %456 = reshape(%455, newshape=[50, 32, 12, 64]); + %457 = transpose(%456, axes=[0, 2, 1, 3]); + %458 = reshape(%457, newshape=[-1, 32, 64]); + %459 = reshape(%454, newshape=[-1, 32, 32]); + %460 = transpose(%458, axes=[0, 2, 1]); + %461 = nn.batch_matmul(%459, %460, out_dtype="float32", transpose_b=True); + %462 = reshape(%461, newshape=[50, 12, 32, 64]); + %463 = transpose(%462, axes=[0, 2, 1, 3]); + %464 = reshape(%463, newshape=[50, 32, 768]); + %465 = reshape(%464, 
newshape=[-1, 768]); + %466 = nn.dense(%465, meta[relay.Constant][92], units=768); + %467 = add(%466, meta[relay.Constant][93]); + %468 = reshape(%467, newshape=[50, 32, 768]); + %469 = add(%425, %468); + %470 = mean(%469, axis=[-1], keepdims=True); + %471 = subtract(%469, %470); + %472 = power(%471, 2f); + %473 = mean(%472, axis=[-1], keepdims=True); + %474 = add(%473, 1e-05f); + %475 = sqrt(%474); + %476 = divide(%471, %475); + %477 = multiply(%476, meta[relay.Constant][94]); + %478 = add(%477, meta[relay.Constant][95]); + %479 = reshape(%478, newshape=[-1, 768]); + %480 = nn.dense(%479, meta[relay.Constant][96], units=3072); + %481 = add(%480, meta[relay.Constant][97]); + %482 = reshape(%481, newshape=[50, 32, 3072]); + %483 = power(%482, 3f); + %484 = multiply(%483, 0.044715f); + %485 = add(%482, %484); + %486 = multiply(%485, 0.797885f); + %487 = tanh(%486); + %488 = multiply(%482, 0.5f); + %489 = add(%487, 1f); + %490 = multiply(%488, %489); + %491 = reshape(%490, newshape=[-1, 3072]); + %492 = nn.dense(%491, meta[relay.Constant][98], units=768); + %493 = add(%492, meta[relay.Constant][99]); + %494 = reshape(%493, newshape=[50, 32, 768]); + %495 = add(%469, %494); + %496 = mean(%495, axis=[-1], keepdims=True); + %497 = subtract(%495, %496); + %498 = power(%497, 2f); + %499 = mean(%498, axis=[-1], keepdims=True); + %500 = add(%499, 1e-05f); + %501 = sqrt(%500); + %502 = divide(%497, %501); + %503 = multiply(%502, meta[relay.Constant][100]); + %504 = add(%503, meta[relay.Constant][101]); + %505 = reshape(%504, newshape=[-1, 768]); + %506 = nn.dense(%505, meta[relay.Constant][102], units=2304); + %507 = add(%506, meta[relay.Constant][103]); + %508 = reshape(%507, newshape=[50, 32, 2304]); + %509 = split(%508, indices_or_sections=[768, 1536], axis=2); + %510 = %509.0; + %511 = reshape(%510, newshape=[50, 32, 12, 64]); + %512 = transpose(%511, axes=[0, 2, 1, 3]); + %513 = %509.1; + %514 = reshape(%513, newshape=[50, 32, 12, 64]); + %515 = transpose(%514, axes=[0, 
2, 3, 1]); + %516 = reshape(%515, newshape=[-1, 64, 32]); + %517 = reshape(%512, newshape=[-1, 32, 64]); + %518 = transpose(%516, axes=[0, 2, 1]); + %519 = nn.batch_matmul(%517, %518, out_dtype="float32", transpose_b=True); + %520 = reshape(%519, newshape=[50, 12, 32, 32]); + %521 = divide(%520, 8f); + %522 = multiply(%521, meta[relay.Constant][104]); + %523 = subtract(%522, meta[relay.Constant][105]); + %524 = nn.softmax(%523, axis=3); + %525 = %509.2; + %526 = reshape(%525, newshape=[50, 32, 12, 64]); + %527 = transpose(%526, axes=[0, 2, 1, 3]); + %528 = reshape(%527, newshape=[-1, 32, 64]); + %529 = reshape(%524, newshape=[-1, 32, 32]); + %530 = transpose(%528, axes=[0, 2, 1]); + %531 = nn.batch_matmul(%529, %530, out_dtype="float32", transpose_b=True); + %532 = reshape(%531, newshape=[50, 12, 32, 64]); + %533 = transpose(%532, axes=[0, 2, 1, 3]); + %534 = reshape(%533, newshape=[50, 32, 768]); + %535 = reshape(%534, newshape=[-1, 768]); + %536 = nn.dense(%535, meta[relay.Constant][106], units=768); + %537 = add(%536, meta[relay.Constant][107]); + %538 = reshape(%537, newshape=[50, 32, 768]); + %539 = add(%495, %538); + %540 = mean(%539, axis=[-1], keepdims=True); + %541 = subtract(%539, %540); + %542 = power(%541, 2f); + %543 = mean(%542, axis=[-1], keepdims=True); + %544 = add(%543, 1e-05f); + %545 = sqrt(%544); + %546 = divide(%541, %545); + %547 = multiply(%546, meta[relay.Constant][108]); + %548 = add(%547, meta[relay.Constant][109]); + %549 = reshape(%548, newshape=[-1, 768]); + %550 = nn.dense(%549, meta[relay.Constant][110], units=3072); + %551 = add(%550, meta[relay.Constant][111]); + %552 = reshape(%551, newshape=[50, 32, 3072]); + %553 = power(%552, 3f); + %554 = multiply(%553, 0.044715f); + %555 = add(%552, %554); + %556 = multiply(%555, 0.797885f); + %557 = tanh(%556); + %558 = multiply(%552, 0.5f); + %559 = add(%557, 1f); + %560 = multiply(%558, %559); + %561 = reshape(%560, newshape=[-1, 3072]); + %562 = nn.dense(%561, meta[relay.Constant][112], 
units=768); + %563 = add(%562, meta[relay.Constant][113]); + %564 = reshape(%563, newshape=[50, 32, 768]); + %565 = add(%539, %564); + %566 = mean(%565, axis=[-1], keepdims=True); + %567 = subtract(%565, %566); + %568 = power(%567, 2f); + %569 = mean(%568, axis=[-1], keepdims=True); + %570 = add(%569, 1e-05f); + %571 = sqrt(%570); + %572 = divide(%567, %571); + %573 = multiply(%572, meta[relay.Constant][114]); + %574 = add(%573, meta[relay.Constant][115]); + %575 = reshape(%574, newshape=[-1, 768]); + %576 = nn.dense(%575, meta[relay.Constant][116], units=2304); + %577 = add(%576, meta[relay.Constant][117]); + %578 = reshape(%577, newshape=[50, 32, 2304]); + %579 = split(%578, indices_or_sections=[768, 1536], axis=2); + %580 = %579.0; + %581 = reshape(%580, newshape=[50, 32, 12, 64]); + %582 = transpose(%581, axes=[0, 2, 1, 3]); + %583 = %579.1; + %584 = reshape(%583, newshape=[50, 32, 12, 64]); + %585 = transpose(%584, axes=[0, 2, 3, 1]); + %586 = reshape(%585, newshape=[-1, 64, 32]); + %587 = reshape(%582, newshape=[-1, 32, 64]); + %588 = transpose(%586, axes=[0, 2, 1]); + %589 = nn.batch_matmul(%587, %588, out_dtype="float32", transpose_b=True); + %590 = reshape(%589, newshape=[50, 12, 32, 32]); + %591 = divide(%590, 8f); + %592 = multiply(%591, meta[relay.Constant][118]); + %593 = subtract(%592, meta[relay.Constant][119]); + %594 = nn.softmax(%593, axis=3); + %595 = %579.2; + %596 = reshape(%595, newshape=[50, 32, 12, 64]); + %597 = transpose(%596, axes=[0, 2, 1, 3]); + %598 = reshape(%597, newshape=[-1, 32, 64]); + %599 = reshape(%594, newshape=[-1, 32, 32]); + %600 = transpose(%598, axes=[0, 2, 1]); + %601 = nn.batch_matmul(%599, %600, out_dtype="float32", transpose_b=True); + %602 = reshape(%601, newshape=[50, 12, 32, 64]); + %603 = transpose(%602, axes=[0, 2, 1, 3]); + %604 = reshape(%603, newshape=[50, 32, 768]); + %605 = reshape(%604, newshape=[-1, 768]); + %606 = nn.dense(%605, meta[relay.Constant][120], units=768); + %607 = add(%606, 
meta[relay.Constant][121]); + %608 = reshape(%607, newshape=[50, 32, 768]); + %609 = add(%565, %608); + %610 = mean(%609, axis=[-1], keepdims=True); + %611 = subtract(%609, %610); + %612 = power(%611, 2f); + %613 = mean(%612, axis=[-1], keepdims=True); + %614 = add(%613, 1e-05f); + %615 = sqrt(%614); + %616 = divide(%611, %615); + %617 = multiply(%616, meta[relay.Constant][122]); + %618 = add(%617, meta[relay.Constant][123]); + %619 = reshape(%618, newshape=[-1, 768]); + %620 = nn.dense(%619, meta[relay.Constant][124], units=3072); + %621 = add(%620, meta[relay.Constant][125]); + %622 = reshape(%621, newshape=[50, 32, 3072]); + %623 = power(%622, 3f); + %624 = multiply(%623, 0.044715f); + %625 = add(%622, %624); + %626 = multiply(%625, 0.797885f); + %627 = tanh(%626); + %628 = multiply(%622, 0.5f); + %629 = add(%627, 1f); + %630 = multiply(%628, %629); + %631 = reshape(%630, newshape=[-1, 3072]); + %632 = nn.dense(%631, meta[relay.Constant][126], units=768); + %633 = add(%632, meta[relay.Constant][127]); + %634 = reshape(%633, newshape=[50, 32, 768]); + %635 = add(%609, %634); + %636 = mean(%635, axis=[-1], keepdims=True); + %637 = subtract(%635, %636); + %638 = power(%637, 2f); + %639 = mean(%638, axis=[-1], keepdims=True); + %640 = add(%639, 1e-05f); + %641 = sqrt(%640); + %642 = divide(%637, %641); + %643 = multiply(%642, meta[relay.Constant][128]); + %644 = add(%643, meta[relay.Constant][129]); + %645 = reshape(%644, newshape=[-1, 768]); + %646 = nn.dense(%645, meta[relay.Constant][130], units=2304); + %647 = add(%646, meta[relay.Constant][131]); + %648 = reshape(%647, newshape=[50, 32, 2304]); + %649 = split(%648, indices_or_sections=[768, 1536], axis=2); + %650 = %649.0; + %651 = reshape(%650, newshape=[50, 32, 12, 64]); + %652 = transpose(%651, axes=[0, 2, 1, 3]); + %653 = %649.1; + %654 = reshape(%653, newshape=[50, 32, 12, 64]); + %655 = transpose(%654, axes=[0, 2, 3, 1]); + %656 = reshape(%655, newshape=[-1, 64, 32]); + %657 = reshape(%652, newshape=[-1, 
32, 64]); + %658 = transpose(%656, axes=[0, 2, 1]); + %659 = nn.batch_matmul(%657, %658, out_dtype="float32", transpose_b=True); + %660 = reshape(%659, newshape=[50, 12, 32, 32]); + %661 = divide(%660, 8f); + %662 = multiply(%661, meta[relay.Constant][132]); + %663 = subtract(%662, meta[relay.Constant][133]); + %664 = nn.softmax(%663, axis=3); + %665 = %649.2; + %666 = reshape(%665, newshape=[50, 32, 12, 64]); + %667 = transpose(%666, axes=[0, 2, 1, 3]); + %668 = reshape(%667, newshape=[-1, 32, 64]); + %669 = reshape(%664, newshape=[-1, 32, 32]); + %670 = transpose(%668, axes=[0, 2, 1]); + %671 = nn.batch_matmul(%669, %670, out_dtype="float32", transpose_b=True); + %672 = reshape(%671, newshape=[50, 12, 32, 64]); + %673 = transpose(%672, axes=[0, 2, 1, 3]); + %674 = reshape(%673, newshape=[50, 32, 768]); + %675 = reshape(%674, newshape=[-1, 768]); + %676 = nn.dense(%675, meta[relay.Constant][134], units=768); + %677 = add(%676, meta[relay.Constant][135]); + %678 = reshape(%677, newshape=[50, 32, 768]); + %679 = add(%635, %678); + %680 = mean(%679, axis=[-1], keepdims=True); + %681 = subtract(%679, %680); + %682 = power(%681, 2f); + %683 = mean(%682, axis=[-1], keepdims=True); + %684 = add(%683, 1e-05f); + %685 = sqrt(%684); + %686 = divide(%681, %685); + %687 = multiply(%686, meta[relay.Constant][136]); + %688 = add(%687, meta[relay.Constant][137]); + %689 = reshape(%688, newshape=[-1, 768]); + %690 = nn.dense(%689, meta[relay.Constant][138], units=3072); + %691 = add(%690, meta[relay.Constant][139]); + %692 = reshape(%691, newshape=[50, 32, 3072]); + %693 = power(%692, 3f); + %694 = multiply(%693, 0.044715f); + %695 = add(%692, %694); + %696 = multiply(%695, 0.797885f); + %697 = tanh(%696); + %698 = multiply(%692, 0.5f); + %699 = add(%697, 1f); + %700 = multiply(%698, %699); + %701 = reshape(%700, newshape=[-1, 3072]); + %702 = nn.dense(%701, meta[relay.Constant][140], units=768); + %703 = add(%702, meta[relay.Constant][141]); + %704 = reshape(%703, newshape=[50, 
32, 768]); + %705 = add(%679, %704); + %706 = mean(%705, axis=[-1], keepdims=True); + %707 = subtract(%705, %706); + %708 = power(%707, 2f); + %709 = mean(%708, axis=[-1], keepdims=True); + %710 = add(%709, 1e-05f); + %711 = sqrt(%710); + %712 = divide(%707, %711); + %713 = multiply(%712, meta[relay.Constant][142]); + %714 = add(%713, meta[relay.Constant][143]); + %715 = reshape(%714, newshape=[-1, 768]); + %716 = nn.dense(%715, meta[relay.Constant][144], units=2304); + %717 = add(%716, meta[relay.Constant][145]); + %718 = reshape(%717, newshape=[50, 32, 2304]); + %719 = split(%718, indices_or_sections=[768, 1536], axis=2); + %720 = %719.0; + %721 = reshape(%720, newshape=[50, 32, 12, 64]); + %722 = transpose(%721, axes=[0, 2, 1, 3]); + %723 = %719.1; + %724 = reshape(%723, newshape=[50, 32, 12, 64]); + %725 = transpose(%724, axes=[0, 2, 3, 1]); + %726 = reshape(%725, newshape=[-1, 64, 32]); + %727 = reshape(%722, newshape=[-1, 32, 64]); + %728 = transpose(%726, axes=[0, 2, 1]); + %729 = nn.batch_matmul(%727, %728, out_dtype="float32", transpose_b=True); + %730 = reshape(%729, newshape=[50, 12, 32, 32]); + %731 = divide(%730, 8f); + %732 = multiply(%731, meta[relay.Constant][146]); + %733 = subtract(%732, meta[relay.Constant][147]); + %734 = nn.softmax(%733, axis=3); + %735 = %719.2; + %736 = reshape(%735, newshape=[50, 32, 12, 64]); + %737 = transpose(%736, axes=[0, 2, 1, 3]); + %738 = reshape(%737, newshape=[-1, 32, 64]); + %739 = reshape(%734, newshape=[-1, 32, 32]); + %740 = transpose(%738, axes=[0, 2, 1]); + %741 = nn.batch_matmul(%739, %740, out_dtype="float32", transpose_b=True); + %742 = reshape(%741, newshape=[50, 12, 32, 64]); + %743 = transpose(%742, axes=[0, 2, 1, 3]); + %744 = reshape(%743, newshape=[50, 32, 768]); + %745 = reshape(%744, newshape=[-1, 768]); + %746 = nn.dense(%745, meta[relay.Constant][148], units=768); + %747 = add(%746, meta[relay.Constant][149]); + %748 = reshape(%747, newshape=[50, 32, 768]); + %749 = add(%705, %748); + %750 = 
mean(%749, axis=[-1], keepdims=True); + %751 = subtract(%749, %750); + %752 = power(%751, 2f); + %753 = mean(%752, axis=[-1], keepdims=True); + %754 = add(%753, 1e-05f); + %755 = sqrt(%754); + %756 = divide(%751, %755); + %757 = multiply(%756, meta[relay.Constant][150]); + %758 = add(%757, meta[relay.Constant][151]); + %759 = reshape(%758, newshape=[-1, 768]); + %760 = nn.dense(%759, meta[relay.Constant][152], units=3072); + %761 = add(%760, meta[relay.Constant][153]); + %762 = reshape(%761, newshape=[50, 32, 3072]); + %763 = power(%762, 3f); + %764 = multiply(%763, 0.044715f); + %765 = add(%762, %764); + %766 = multiply(%765, 0.797885f); + %767 = tanh(%766); + %768 = multiply(%762, 0.5f); + %769 = add(%767, 1f); + %770 = multiply(%768, %769); + %771 = reshape(%770, newshape=[-1, 3072]); + %772 = nn.dense(%771, meta[relay.Constant][154], units=768); + %773 = add(%772, meta[relay.Constant][155]); + %774 = reshape(%773, newshape=[50, 32, 768]); + %775 = add(%749, %774); + %776 = mean(%775, axis=[-1], keepdims=True); + %777 = subtract(%775, %776); + %778 = power(%777, 2f); + %779 = mean(%778, axis=[-1], keepdims=True); + %780 = add(%779, 1e-05f); + %781 = sqrt(%780); + %782 = divide(%777, %781); + %783 = multiply(%782, meta[relay.Constant][156]); + %784 = add(%783, meta[relay.Constant][157]); + %785 = reshape(%784, newshape=[-1, 768]); + %786 = nn.dense(%785, meta[relay.Constant][158], units=2304); + %787 = add(%786, meta[relay.Constant][159]); + %788 = reshape(%787, newshape=[50, 32, 2304]); + %789 = split(%788, indices_or_sections=[768, 1536], axis=2); + %790 = %789.0; + %791 = reshape(%790, newshape=[50, 32, 12, 64]); + %792 = transpose(%791, axes=[0, 2, 1, 3]); + %793 = %789.1; + %794 = reshape(%793, newshape=[50, 32, 12, 64]); + %795 = transpose(%794, axes=[0, 2, 3, 1]); + %796 = reshape(%795, newshape=[-1, 64, 32]); + %797 = reshape(%792, newshape=[-1, 32, 64]); + %798 = transpose(%796, axes=[0, 2, 1]); + %799 = nn.batch_matmul(%797, %798, out_dtype="float32", 
transpose_b=True); + %800 = reshape(%799, newshape=[50, 12, 32, 32]); + %801 = divide(%800, 8f); + %802 = multiply(%801, meta[relay.Constant][160]); + %803 = subtract(%802, meta[relay.Constant][161]); + %804 = nn.softmax(%803, axis=3); + %805 = %789.2; + %806 = reshape(%805, newshape=[50, 32, 12, 64]); + %807 = transpose(%806, axes=[0, 2, 1, 3]); + %808 = reshape(%807, newshape=[-1, 32, 64]); + %809 = reshape(%804, newshape=[-1, 32, 32]); + %810 = transpose(%808, axes=[0, 2, 1]); + %811 = nn.batch_matmul(%809, %810, out_dtype="float32", transpose_b=True); + %812 = reshape(%811, newshape=[50, 12, 32, 64]); + %813 = transpose(%812, axes=[0, 2, 1, 3]); + %814 = reshape(%813, newshape=[50, 32, 768]); + %815 = reshape(%814, newshape=[-1, 768]); + %816 = nn.dense(%815, meta[relay.Constant][162], units=768); + %817 = add(%816, meta[relay.Constant][163]); + %818 = reshape(%817, newshape=[50, 32, 768]); + %819 = add(%775, %818); + %820 = mean(%819, axis=[-1], keepdims=True); + %821 = subtract(%819, %820); + %822 = power(%821, 2f); + %823 = mean(%822, axis=[-1], keepdims=True); + %824 = add(%823, 1e-05f); + %825 = sqrt(%824); + %826 = divide(%821, %825); + %827 = multiply(%826, meta[relay.Constant][164]); + %828 = add(%827, meta[relay.Constant][165]); + %829 = reshape(%828, newshape=[-1, 768]); + %830 = nn.dense(%829, meta[relay.Constant][166], units=3072); + %831 = add(%830, meta[relay.Constant][167]); + %832 = reshape(%831, newshape=[50, 32, 3072]); + %833 = power(%832, 3f); + %834 = multiply(%833, 0.044715f); + %835 = add(%832, %834); + %836 = multiply(%835, 0.797885f); + %837 = tanh(%836); + %838 = multiply(%832, 0.5f); + %839 = add(%837, 1f); + %840 = multiply(%838, %839); + %841 = reshape(%840, newshape=[-1, 3072]); + %842 = nn.dense(%841, meta[relay.Constant][168], units=768); + %843 = add(%842, meta[relay.Constant][169]); + %844 = reshape(%843, newshape=[50, 32, 768]); + %845 = add(%819, %844); + %846 = mean(%845, axis=[-1], keepdims=True); + %847 = subtract(%845, 
%846); + %848 = power(%847, 2f); + %849 = mean(%848, axis=[-1], keepdims=True); + %850 = add(%849, 1e-05f); + %851 = sqrt(%850); + %852 = divide(%847, %851); + %853 = multiply(%852, meta[relay.Constant][170]); + %854 = add(%853, meta[relay.Constant][171]); + %855 = transpose(%24, axes=[0, 2, 1, 3]); + %856 = expand_dims(%855, axis=0); + %857 = expand_dims(%37, axis=0); + %858 = (%856, %857); + %859 = transpose(%94, axes=[0, 2, 1, 3]); + %860 = expand_dims(%859, axis=0); + %861 = expand_dims(%107, axis=0); + %862 = (%860, %861); + %863 = transpose(%164, axes=[0, 2, 1, 3]); + %864 = expand_dims(%863, axis=0); + %865 = expand_dims(%177, axis=0); + %866 = (%864, %865); + %867 = transpose(%234, axes=[0, 2, 1, 3]); + %868 = expand_dims(%867, axis=0); + %869 = expand_dims(%247, axis=0); + %870 = (%868, %869); + %871 = transpose(%304, axes=[0, 2, 1, 3]); + %872 = expand_dims(%871, axis=0); + %873 = expand_dims(%317, axis=0); + %874 = (%872, %873); + %875 = transpose(%374, axes=[0, 2, 1, 3]); + %876 = expand_dims(%875, axis=0); + %877 = expand_dims(%387, axis=0); + %878 = (%876, %877); + %879 = transpose(%444, axes=[0, 2, 1, 3]); + %880 = expand_dims(%879, axis=0); + %881 = expand_dims(%457, axis=0); + %882 = (%880, %881); + %883 = transpose(%514, axes=[0, 2, 1, 3]); + %884 = expand_dims(%883, axis=0); + %885 = expand_dims(%527, axis=0); + %886 = (%884, %885); + %887 = transpose(%584, axes=[0, 2, 1, 3]); + %888 = expand_dims(%887, axis=0); + %889 = expand_dims(%597, axis=0); + %890 = (%888, %889); + %891 = transpose(%654, axes=[0, 2, 1, 3]); + %892 = expand_dims(%891, axis=0); + %893 = expand_dims(%667, axis=0); + %894 = (%892, %893); + %895 = transpose(%724, axes=[0, 2, 1, 3]); + %896 = expand_dims(%895, axis=0); + %897 = expand_dims(%737, axis=0); + %898 = (%896, %897); + %899 = transpose(%794, axes=[0, 2, 1, 3]); + %900 = expand_dims(%899, axis=0); + %901 = expand_dims(%807, axis=0); + %902 = (%900, %901); + %903 = reshape(%854, newshape=[1, 50, 32, 768]); + %904 = 
concatenate(%858); + %905 = concatenate(%862); + %906 = concatenate(%866); + %907 = concatenate(%870); + %908 = concatenate(%874); + %909 = concatenate(%878); + %910 = concatenate(%882); + %911 = concatenate(%886); + %912 = concatenate(%890); + %913 = concatenate(%894); + %914 = concatenate(%898); + %915 = concatenate(%902); + (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2", + "input_shapes": {"x": [1, 50, 32]}, + "input_dtypes": {"x": "int64"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_16(): + metatable = {"relay.Constant": gpt2_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 50, 32), int64]) -> (Tensor[(1, 50, 32, 768), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16], + Tensor[(2, 50, 12, 32, 64), float16]) { + %0 = reshape(%x, newshape=[-1, 32]); + %1 = less(%0, 0i64); + %2 = add(%0, 50257i64); + %3 = where(%1, %2, %0); + %4 = take(meta[relay.Constant][0], %3, axis=0); + %5 = add(%4, meta[relay.Constant][1]); + %6 = mean(%5, axis=[-1], keepdims=True); + %7 = subtract(%5, %6); + %8 = power(%7, 2f16); + %9 = mean(%8, axis=[-1], keepdims=True); + %10 = add(%9, 1e-05f16); + %11 = sqrt(%10); + %12 = divide(%7, %11); + %13 = multiply(%12, meta[relay.Constant][2]); + %14 = add(%13, meta[relay.Constant][3]); + %15 = reshape(%14, newshape=[-1, 768]); + %16 = nn.dense(%15, meta[relay.Constant][4], units=2304); + %17 = add(%16, meta[relay.Constant][5]); + %18 = reshape(%17, 
newshape=[50, 32, 2304]); + %19 = split(%18, indices_or_sections=[768, 1536], axis=2); + %20 = %19.0; + %21 = reshape(%20, newshape=[50, 32, 12, 64]); + %22 = transpose(%21, axes=[0, 2, 1, 3]); + %23 = %19.1; + %24 = reshape(%23, newshape=[50, 32, 12, 64]); + %25 = transpose(%24, axes=[0, 2, 3, 1]); + %26 = reshape(%25, newshape=[-1, 64, 32]); + %27 = reshape(%22, newshape=[-1, 32, 64]); + %28 = transpose(%26, axes=[0, 2, 1]); + %29 = nn.batch_matmul(%27, %28, out_dtype="float16", transpose_b=True); + %30 = reshape(%29, newshape=[50, 12, 32, 32]); + %31 = divide(%30, 8f16); + %32 = multiply(%31, meta[relay.Constant][6]); + %33 = subtract(%32, meta[relay.Constant][7]); + %34 = nn.softmax(%33, axis=3); + %35 = %19.2; + %36 = reshape(%35, newshape=[50, 32, 12, 64]); + %37 = transpose(%36, axes=[0, 2, 1, 3]); + %38 = reshape(%37, newshape=[-1, 32, 64]); + %39 = reshape(%34, newshape=[-1, 32, 32]); + %40 = transpose(%38, axes=[0, 2, 1]); + %41 = nn.batch_matmul(%39, %40, out_dtype="float16", transpose_b=True); + %42 = reshape(%41, newshape=[50, 12, 32, 64]); + %43 = transpose(%42, axes=[0, 2, 1, 3]); + %44 = reshape(%43, newshape=[50, 32, 768]); + %45 = reshape(%44, newshape=[-1, 768]); + %46 = nn.dense(%45, meta[relay.Constant][8], units=768); + %47 = add(%46, meta[relay.Constant][9]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(%5, %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f16); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f16); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][10]); + %58 = add(%57, meta[relay.Constant][11]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][12], units=3072); + %61 = add(%60, meta[relay.Constant][13]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f16); + %64 = multiply(%63, 0.044715f16); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f16); + 
%67 = tanh(%66); + %68 = multiply(%62, 0.5f16); + %69 = add(%67, 1f16); + %70 = multiply(%68, %69); + %71 = reshape(%70, newshape=[-1, 3072]); + %72 = nn.dense(%71, meta[relay.Constant][14], units=768); + %73 = add(%72, meta[relay.Constant][15]); + %74 = reshape(%73, newshape=[50, 32, 768]); + %75 = add(%49, %74); + %76 = mean(%75, axis=[-1], keepdims=True); + %77 = subtract(%75, %76); + %78 = power(%77, 2f16); + %79 = mean(%78, axis=[-1], keepdims=True); + %80 = add(%79, 1e-05f16); + %81 = sqrt(%80); + %82 = divide(%77, %81); + %83 = multiply(%82, meta[relay.Constant][16]); + %84 = add(%83, meta[relay.Constant][17]); + %85 = reshape(%84, newshape=[-1, 768]); + %86 = nn.dense(%85, meta[relay.Constant][18], units=2304); + %87 = add(%86, meta[relay.Constant][19]); + %88 = reshape(%87, newshape=[50, 32, 2304]); + %89 = split(%88, indices_or_sections=[768, 1536], axis=2); + %90 = %89.0; + %91 = reshape(%90, newshape=[50, 32, 12, 64]); + %92 = transpose(%91, axes=[0, 2, 1, 3]); + %93 = %89.1; + %94 = reshape(%93, newshape=[50, 32, 12, 64]); + %95 = transpose(%94, axes=[0, 2, 3, 1]); + %96 = reshape(%95, newshape=[-1, 64, 32]); + %97 = reshape(%92, newshape=[-1, 32, 64]); + %98 = transpose(%96, axes=[0, 2, 1]); + %99 = nn.batch_matmul(%97, %98, out_dtype="float16", transpose_b=True); + %100 = reshape(%99, newshape=[50, 12, 32, 32]); + %101 = divide(%100, 8f16); + %102 = multiply(%101, meta[relay.Constant][20]); + %103 = subtract(%102, meta[relay.Constant][21]); + %104 = nn.softmax(%103, axis=3); + %105 = %89.2; + %106 = reshape(%105, newshape=[50, 32, 12, 64]); + %107 = transpose(%106, axes=[0, 2, 1, 3]); + %108 = reshape(%107, newshape=[-1, 32, 64]); + %109 = reshape(%104, newshape=[-1, 32, 32]); + %110 = transpose(%108, axes=[0, 2, 1]); + %111 = nn.batch_matmul(%109, %110, out_dtype="float16", transpose_b=True); + %112 = reshape(%111, newshape=[50, 12, 32, 64]); + %113 = transpose(%112, axes=[0, 2, 1, 3]); + %114 = reshape(%113, newshape=[50, 32, 768]); + %115 = 
reshape(%114, newshape=[-1, 768]); + %116 = nn.dense(%115, meta[relay.Constant][22], units=768); + %117 = add(%116, meta[relay.Constant][23]); + %118 = reshape(%117, newshape=[50, 32, 768]); + %119 = add(%75, %118); + %120 = mean(%119, axis=[-1], keepdims=True); + %121 = subtract(%119, %120); + %122 = power(%121, 2f16); + %123 = mean(%122, axis=[-1], keepdims=True); + %124 = add(%123, 1e-05f16); + %125 = sqrt(%124); + %126 = divide(%121, %125); + %127 = multiply(%126, meta[relay.Constant][24]); + %128 = add(%127, meta[relay.Constant][25]); + %129 = reshape(%128, newshape=[-1, 768]); + %130 = nn.dense(%129, meta[relay.Constant][26], units=3072); + %131 = add(%130, meta[relay.Constant][27]); + %132 = reshape(%131, newshape=[50, 32, 3072]); + %133 = power(%132, 3f16); + %134 = multiply(%133, 0.044715f16); + %135 = add(%132, %134); + %136 = multiply(%135, 0.797885f16); + %137 = tanh(%136); + %138 = multiply(%132, 0.5f16); + %139 = add(%137, 1f16); + %140 = multiply(%138, %139); + %141 = reshape(%140, newshape=[-1, 3072]); + %142 = nn.dense(%141, meta[relay.Constant][28], units=768); + %143 = add(%142, meta[relay.Constant][29]); + %144 = reshape(%143, newshape=[50, 32, 768]); + %145 = add(%119, %144); + %146 = mean(%145, axis=[-1], keepdims=True); + %147 = subtract(%145, %146); + %148 = power(%147, 2f16); + %149 = mean(%148, axis=[-1], keepdims=True); + %150 = add(%149, 1e-05f16); + %151 = sqrt(%150); + %152 = divide(%147, %151); + %153 = multiply(%152, meta[relay.Constant][30]); + %154 = add(%153, meta[relay.Constant][31]); + %155 = reshape(%154, newshape=[-1, 768]); + %156 = nn.dense(%155, meta[relay.Constant][32], units=2304); + %157 = add(%156, meta[relay.Constant][33]); + %158 = reshape(%157, newshape=[50, 32, 2304]); + %159 = split(%158, indices_or_sections=[768, 1536], axis=2); + %160 = %159.0; + %161 = reshape(%160, newshape=[50, 32, 12, 64]); + %162 = transpose(%161, axes=[0, 2, 1, 3]); + %163 = %159.1; + %164 = reshape(%163, newshape=[50, 32, 12, 64]); + %165 
= transpose(%164, axes=[0, 2, 3, 1]); + %166 = reshape(%165, newshape=[-1, 64, 32]); + %167 = reshape(%162, newshape=[-1, 32, 64]); + %168 = transpose(%166, axes=[0, 2, 1]); + %169 = nn.batch_matmul(%167, %168, out_dtype="float16", transpose_b=True); + %170 = reshape(%169, newshape=[50, 12, 32, 32]); + %171 = divide(%170, 8f16); + %172 = multiply(%171, meta[relay.Constant][34]); + %173 = subtract(%172, meta[relay.Constant][35]); + %174 = nn.softmax(%173, axis=3); + %175 = %159.2; + %176 = reshape(%175, newshape=[50, 32, 12, 64]); + %177 = transpose(%176, axes=[0, 2, 1, 3]); + %178 = reshape(%177, newshape=[-1, 32, 64]); + %179 = reshape(%174, newshape=[-1, 32, 32]); + %180 = transpose(%178, axes=[0, 2, 1]); + %181 = nn.batch_matmul(%179, %180, out_dtype="float16", transpose_b=True); + %182 = reshape(%181, newshape=[50, 12, 32, 64]); + %183 = transpose(%182, axes=[0, 2, 1, 3]); + %184 = reshape(%183, newshape=[50, 32, 768]); + %185 = reshape(%184, newshape=[-1, 768]); + %186 = nn.dense(%185, meta[relay.Constant][36], units=768); + %187 = add(%186, meta[relay.Constant][37]); + %188 = reshape(%187, newshape=[50, 32, 768]); + %189 = add(%145, %188); + %190 = mean(%189, axis=[-1], keepdims=True); + %191 = subtract(%189, %190); + %192 = power(%191, 2f16); + %193 = mean(%192, axis=[-1], keepdims=True); + %194 = add(%193, 1e-05f16); + %195 = sqrt(%194); + %196 = divide(%191, %195); + %197 = multiply(%196, meta[relay.Constant][38]); + %198 = add(%197, meta[relay.Constant][39]); + %199 = reshape(%198, newshape=[-1, 768]); + %200 = nn.dense(%199, meta[relay.Constant][40], units=3072); + %201 = add(%200, meta[relay.Constant][41]); + %202 = reshape(%201, newshape=[50, 32, 3072]); + %203 = power(%202, 3f16); + %204 = multiply(%203, 0.044715f16); + %205 = add(%202, %204); + %206 = multiply(%205, 0.797885f16); + %207 = tanh(%206); + %208 = multiply(%202, 0.5f16); + %209 = add(%207, 1f16); + %210 = multiply(%208, %209); + %211 = reshape(%210, newshape=[-1, 3072]); + %212 = 
nn.dense(%211, meta[relay.Constant][42], units=768); + %213 = add(%212, meta[relay.Constant][43]); + %214 = reshape(%213, newshape=[50, 32, 768]); + %215 = add(%189, %214); + %216 = mean(%215, axis=[-1], keepdims=True); + %217 = subtract(%215, %216); + %218 = power(%217, 2f16); + %219 = mean(%218, axis=[-1], keepdims=True); + %220 = add(%219, 1e-05f16); + %221 = sqrt(%220); + %222 = divide(%217, %221); + %223 = multiply(%222, meta[relay.Constant][44]); + %224 = add(%223, meta[relay.Constant][45]); + %225 = reshape(%224, newshape=[-1, 768]); + %226 = nn.dense(%225, meta[relay.Constant][46], units=2304); + %227 = add(%226, meta[relay.Constant][47]); + %228 = reshape(%227, newshape=[50, 32, 2304]); + %229 = split(%228, indices_or_sections=[768, 1536], axis=2); + %230 = %229.0; + %231 = reshape(%230, newshape=[50, 32, 12, 64]); + %232 = transpose(%231, axes=[0, 2, 1, 3]); + %233 = %229.1; + %234 = reshape(%233, newshape=[50, 32, 12, 64]); + %235 = transpose(%234, axes=[0, 2, 3, 1]); + %236 = reshape(%235, newshape=[-1, 64, 32]); + %237 = reshape(%232, newshape=[-1, 32, 64]); + %238 = transpose(%236, axes=[0, 2, 1]); + %239 = nn.batch_matmul(%237, %238, out_dtype="float16", transpose_b=True); + %240 = reshape(%239, newshape=[50, 12, 32, 32]); + %241 = divide(%240, 8f16); + %242 = multiply(%241, meta[relay.Constant][48]); + %243 = subtract(%242, meta[relay.Constant][49]); + %244 = nn.softmax(%243, axis=3); + %245 = %229.2; + %246 = reshape(%245, newshape=[50, 32, 12, 64]); + %247 = transpose(%246, axes=[0, 2, 1, 3]); + %248 = reshape(%247, newshape=[-1, 32, 64]); + %249 = reshape(%244, newshape=[-1, 32, 32]); + %250 = transpose(%248, axes=[0, 2, 1]); + %251 = nn.batch_matmul(%249, %250, out_dtype="float16", transpose_b=True); + %252 = reshape(%251, newshape=[50, 12, 32, 64]); + %253 = transpose(%252, axes=[0, 2, 1, 3]); + %254 = reshape(%253, newshape=[50, 32, 768]); + %255 = reshape(%254, newshape=[-1, 768]); + %256 = nn.dense(%255, meta[relay.Constant][50], units=768); 
+ %257 = add(%256, meta[relay.Constant][51]); + %258 = reshape(%257, newshape=[50, 32, 768]); + %259 = add(%215, %258); + %260 = mean(%259, axis=[-1], keepdims=True); + %261 = subtract(%259, %260); + %262 = power(%261, 2f16); + %263 = mean(%262, axis=[-1], keepdims=True); + %264 = add(%263, 1e-05f16); + %265 = sqrt(%264); + %266 = divide(%261, %265); + %267 = multiply(%266, meta[relay.Constant][52]); + %268 = add(%267, meta[relay.Constant][53]); + %269 = reshape(%268, newshape=[-1, 768]); + %270 = nn.dense(%269, meta[relay.Constant][54], units=3072); + %271 = add(%270, meta[relay.Constant][55]); + %272 = reshape(%271, newshape=[50, 32, 3072]); + %273 = power(%272, 3f16); + %274 = multiply(%273, 0.044715f16); + %275 = add(%272, %274); + %276 = multiply(%275, 0.797885f16); + %277 = tanh(%276); + %278 = multiply(%272, 0.5f16); + %279 = add(%277, 1f16); + %280 = multiply(%278, %279); + %281 = reshape(%280, newshape=[-1, 3072]); + %282 = nn.dense(%281, meta[relay.Constant][56], units=768); + %283 = add(%282, meta[relay.Constant][57]); + %284 = reshape(%283, newshape=[50, 32, 768]); + %285 = add(%259, %284); + %286 = mean(%285, axis=[-1], keepdims=True); + %287 = subtract(%285, %286); + %288 = power(%287, 2f16); + %289 = mean(%288, axis=[-1], keepdims=True); + %290 = add(%289, 1e-05f16); + %291 = sqrt(%290); + %292 = divide(%287, %291); + %293 = multiply(%292, meta[relay.Constant][58]); + %294 = add(%293, meta[relay.Constant][59]); + %295 = reshape(%294, newshape=[-1, 768]); + %296 = nn.dense(%295, meta[relay.Constant][60], units=2304); + %297 = add(%296, meta[relay.Constant][61]); + %298 = reshape(%297, newshape=[50, 32, 2304]); + %299 = split(%298, indices_or_sections=[768, 1536], axis=2); + %300 = %299.0; + %301 = reshape(%300, newshape=[50, 32, 12, 64]); + %302 = transpose(%301, axes=[0, 2, 1, 3]); + %303 = %299.1; + %304 = reshape(%303, newshape=[50, 32, 12, 64]); + %305 = transpose(%304, axes=[0, 2, 3, 1]); + %306 = reshape(%305, newshape=[-1, 64, 32]); + %307 = 
reshape(%302, newshape=[-1, 32, 64]); + %308 = transpose(%306, axes=[0, 2, 1]); + %309 = nn.batch_matmul(%307, %308, out_dtype="float16", transpose_b=True); + %310 = reshape(%309, newshape=[50, 12, 32, 32]); + %311 = divide(%310, 8f16); + %312 = multiply(%311, meta[relay.Constant][62]); + %313 = subtract(%312, meta[relay.Constant][63]); + %314 = nn.softmax(%313, axis=3); + %315 = %299.2; + %316 = reshape(%315, newshape=[50, 32, 12, 64]); + %317 = transpose(%316, axes=[0, 2, 1, 3]); + %318 = reshape(%317, newshape=[-1, 32, 64]); + %319 = reshape(%314, newshape=[-1, 32, 32]); + %320 = transpose(%318, axes=[0, 2, 1]); + %321 = nn.batch_matmul(%319, %320, out_dtype="float16", transpose_b=True); + %322 = reshape(%321, newshape=[50, 12, 32, 64]); + %323 = transpose(%322, axes=[0, 2, 1, 3]); + %324 = reshape(%323, newshape=[50, 32, 768]); + %325 = reshape(%324, newshape=[-1, 768]); + %326 = nn.dense(%325, meta[relay.Constant][64], units=768); + %327 = add(%326, meta[relay.Constant][65]); + %328 = reshape(%327, newshape=[50, 32, 768]); + %329 = add(%285, %328); + %330 = mean(%329, axis=[-1], keepdims=True); + %331 = subtract(%329, %330); + %332 = power(%331, 2f16); + %333 = mean(%332, axis=[-1], keepdims=True); + %334 = add(%333, 1e-05f16); + %335 = sqrt(%334); + %336 = divide(%331, %335); + %337 = multiply(%336, meta[relay.Constant][66]); + %338 = add(%337, meta[relay.Constant][67]); + %339 = reshape(%338, newshape=[-1, 768]); + %340 = nn.dense(%339, meta[relay.Constant][68], units=3072); + %341 = add(%340, meta[relay.Constant][69]); + %342 = reshape(%341, newshape=[50, 32, 3072]); + %343 = power(%342, 3f16); + %344 = multiply(%343, 0.044715f16); + %345 = add(%342, %344); + %346 = multiply(%345, 0.797885f16); + %347 = tanh(%346); + %348 = multiply(%342, 0.5f16); + %349 = add(%347, 1f16); + %350 = multiply(%348, %349); + %351 = reshape(%350, newshape=[-1, 3072]); + %352 = nn.dense(%351, meta[relay.Constant][70], units=768); + %353 = add(%352, meta[relay.Constant][71]); + 
%354 = reshape(%353, newshape=[50, 32, 768]); + %355 = add(%329, %354); + %356 = mean(%355, axis=[-1], keepdims=True); + %357 = subtract(%355, %356); + %358 = power(%357, 2f16); + %359 = mean(%358, axis=[-1], keepdims=True); + %360 = add(%359, 1e-05f16); + %361 = sqrt(%360); + %362 = divide(%357, %361); + %363 = multiply(%362, meta[relay.Constant][72]); + %364 = add(%363, meta[relay.Constant][73]); + %365 = reshape(%364, newshape=[-1, 768]); + %366 = nn.dense(%365, meta[relay.Constant][74], units=2304); + %367 = add(%366, meta[relay.Constant][75]); + %368 = reshape(%367, newshape=[50, 32, 2304]); + %369 = split(%368, indices_or_sections=[768, 1536], axis=2); + %370 = %369.0; + %371 = reshape(%370, newshape=[50, 32, 12, 64]); + %372 = transpose(%371, axes=[0, 2, 1, 3]); + %373 = %369.1; + %374 = reshape(%373, newshape=[50, 32, 12, 64]); + %375 = transpose(%374, axes=[0, 2, 3, 1]); + %376 = reshape(%375, newshape=[-1, 64, 32]); + %377 = reshape(%372, newshape=[-1, 32, 64]); + %378 = transpose(%376, axes=[0, 2, 1]); + %379 = nn.batch_matmul(%377, %378, out_dtype="float16", transpose_b=True); + %380 = reshape(%379, newshape=[50, 12, 32, 32]); + %381 = divide(%380, 8f16); + %382 = multiply(%381, meta[relay.Constant][76]); + %383 = subtract(%382, meta[relay.Constant][77]); + %384 = nn.softmax(%383, axis=3); + %385 = %369.2; + %386 = reshape(%385, newshape=[50, 32, 12, 64]); + %387 = transpose(%386, axes=[0, 2, 1, 3]); + %388 = reshape(%387, newshape=[-1, 32, 64]); + %389 = reshape(%384, newshape=[-1, 32, 32]); + %390 = transpose(%388, axes=[0, 2, 1]); + %391 = nn.batch_matmul(%389, %390, out_dtype="float16", transpose_b=True); + %392 = reshape(%391, newshape=[50, 12, 32, 64]); + %393 = transpose(%392, axes=[0, 2, 1, 3]); + %394 = reshape(%393, newshape=[50, 32, 768]); + %395 = reshape(%394, newshape=[-1, 768]); + %396 = nn.dense(%395, meta[relay.Constant][78], units=768); + %397 = add(%396, meta[relay.Constant][79]); + %398 = reshape(%397, newshape=[50, 32, 768]); + %399 
= add(%355, %398); + %400 = mean(%399, axis=[-1], keepdims=True); + %401 = subtract(%399, %400); + %402 = power(%401, 2f16); + %403 = mean(%402, axis=[-1], keepdims=True); + %404 = add(%403, 1e-05f16); + %405 = sqrt(%404); + %406 = divide(%401, %405); + %407 = multiply(%406, meta[relay.Constant][80]); + %408 = add(%407, meta[relay.Constant][81]); + %409 = reshape(%408, newshape=[-1, 768]); + %410 = nn.dense(%409, meta[relay.Constant][82], units=3072); + %411 = add(%410, meta[relay.Constant][83]); + %412 = reshape(%411, newshape=[50, 32, 3072]); + %413 = power(%412, 3f16); + %414 = multiply(%413, 0.044715f16); + %415 = add(%412, %414); + %416 = multiply(%415, 0.797885f16); + %417 = tanh(%416); + %418 = multiply(%412, 0.5f16); + %419 = add(%417, 1f16); + %420 = multiply(%418, %419); + %421 = reshape(%420, newshape=[-1, 3072]); + %422 = nn.dense(%421, meta[relay.Constant][84], units=768); + %423 = add(%422, meta[relay.Constant][85]); + %424 = reshape(%423, newshape=[50, 32, 768]); + %425 = add(%399, %424); + %426 = mean(%425, axis=[-1], keepdims=True); + %427 = subtract(%425, %426); + %428 = power(%427, 2f16); + %429 = mean(%428, axis=[-1], keepdims=True); + %430 = add(%429, 1e-05f16); + %431 = sqrt(%430); + %432 = divide(%427, %431); + %433 = multiply(%432, meta[relay.Constant][86]); + %434 = add(%433, meta[relay.Constant][87]); + %435 = reshape(%434, newshape=[-1, 768]); + %436 = nn.dense(%435, meta[relay.Constant][88], units=2304); + %437 = add(%436, meta[relay.Constant][89]); + %438 = reshape(%437, newshape=[50, 32, 2304]); + %439 = split(%438, indices_or_sections=[768, 1536], axis=2); + %440 = %439.0; + %441 = reshape(%440, newshape=[50, 32, 12, 64]); + %442 = transpose(%441, axes=[0, 2, 1, 3]); + %443 = %439.1; + %444 = reshape(%443, newshape=[50, 32, 12, 64]); + %445 = transpose(%444, axes=[0, 2, 3, 1]); + %446 = reshape(%445, newshape=[-1, 64, 32]); + %447 = reshape(%442, newshape=[-1, 32, 64]); + %448 = transpose(%446, axes=[0, 2, 1]); + %449 = 
nn.batch_matmul(%447, %448, out_dtype="float16", transpose_b=True); + %450 = reshape(%449, newshape=[50, 12, 32, 32]); + %451 = divide(%450, 8f16); + %452 = multiply(%451, meta[relay.Constant][90]); + %453 = subtract(%452, meta[relay.Constant][91]); + %454 = nn.softmax(%453, axis=3); + %455 = %439.2; + %456 = reshape(%455, newshape=[50, 32, 12, 64]); + %457 = transpose(%456, axes=[0, 2, 1, 3]); + %458 = reshape(%457, newshape=[-1, 32, 64]); + %459 = reshape(%454, newshape=[-1, 32, 32]); + %460 = transpose(%458, axes=[0, 2, 1]); + %461 = nn.batch_matmul(%459, %460, out_dtype="float16", transpose_b=True); + %462 = reshape(%461, newshape=[50, 12, 32, 64]); + %463 = transpose(%462, axes=[0, 2, 1, 3]); + %464 = reshape(%463, newshape=[50, 32, 768]); + %465 = reshape(%464, newshape=[-1, 768]); + %466 = nn.dense(%465, meta[relay.Constant][92], units=768); + %467 = add(%466, meta[relay.Constant][93]); + %468 = reshape(%467, newshape=[50, 32, 768]); + %469 = add(%425, %468); + %470 = mean(%469, axis=[-1], keepdims=True); + %471 = subtract(%469, %470); + %472 = power(%471, 2f16); + %473 = mean(%472, axis=[-1], keepdims=True); + %474 = add(%473, 1e-05f16); + %475 = sqrt(%474); + %476 = divide(%471, %475); + %477 = multiply(%476, meta[relay.Constant][94]); + %478 = add(%477, meta[relay.Constant][95]); + %479 = reshape(%478, newshape=[-1, 768]); + %480 = nn.dense(%479, meta[relay.Constant][96], units=3072); + %481 = add(%480, meta[relay.Constant][97]); + %482 = reshape(%481, newshape=[50, 32, 3072]); + %483 = power(%482, 3f16); + %484 = multiply(%483, 0.044715f16); + %485 = add(%482, %484); + %486 = multiply(%485, 0.797885f16); + %487 = tanh(%486); + %488 = multiply(%482, 0.5f16); + %489 = add(%487, 1f16); + %490 = multiply(%488, %489); + %491 = reshape(%490, newshape=[-1, 3072]); + %492 = nn.dense(%491, meta[relay.Constant][98], units=768); + %493 = add(%492, meta[relay.Constant][99]); + %494 = reshape(%493, newshape=[50, 32, 768]); + %495 = add(%469, %494); + %496 = 
mean(%495, axis=[-1], keepdims=True); + %497 = subtract(%495, %496); + %498 = power(%497, 2f16); + %499 = mean(%498, axis=[-1], keepdims=True); + %500 = add(%499, 1e-05f16); + %501 = sqrt(%500); + %502 = divide(%497, %501); + %503 = multiply(%502, meta[relay.Constant][100]); + %504 = add(%503, meta[relay.Constant][101]); + %505 = reshape(%504, newshape=[-1, 768]); + %506 = nn.dense(%505, meta[relay.Constant][102], units=2304); + %507 = add(%506, meta[relay.Constant][103]); + %508 = reshape(%507, newshape=[50, 32, 2304]); + %509 = split(%508, indices_or_sections=[768, 1536], axis=2); + %510 = %509.0; + %511 = reshape(%510, newshape=[50, 32, 12, 64]); + %512 = transpose(%511, axes=[0, 2, 1, 3]); + %513 = %509.1; + %514 = reshape(%513, newshape=[50, 32, 12, 64]); + %515 = transpose(%514, axes=[0, 2, 3, 1]); + %516 = reshape(%515, newshape=[-1, 64, 32]); + %517 = reshape(%512, newshape=[-1, 32, 64]); + %518 = transpose(%516, axes=[0, 2, 1]); + %519 = nn.batch_matmul(%517, %518, out_dtype="float16", transpose_b=True); + %520 = reshape(%519, newshape=[50, 12, 32, 32]); + %521 = divide(%520, 8f16); + %522 = multiply(%521, meta[relay.Constant][104]); + %523 = subtract(%522, meta[relay.Constant][105]); + %524 = nn.softmax(%523, axis=3); + %525 = %509.2; + %526 = reshape(%525, newshape=[50, 32, 12, 64]); + %527 = transpose(%526, axes=[0, 2, 1, 3]); + %528 = reshape(%527, newshape=[-1, 32, 64]); + %529 = reshape(%524, newshape=[-1, 32, 32]); + %530 = transpose(%528, axes=[0, 2, 1]); + %531 = nn.batch_matmul(%529, %530, out_dtype="float16", transpose_b=True); + %532 = reshape(%531, newshape=[50, 12, 32, 64]); + %533 = transpose(%532, axes=[0, 2, 1, 3]); + %534 = reshape(%533, newshape=[50, 32, 768]); + %535 = reshape(%534, newshape=[-1, 768]); + %536 = nn.dense(%535, meta[relay.Constant][106], units=768); + %537 = add(%536, meta[relay.Constant][107]); + %538 = reshape(%537, newshape=[50, 32, 768]); + %539 = add(%495, %538); + %540 = mean(%539, axis=[-1], keepdims=True); + %541 
= subtract(%539, %540); + %542 = power(%541, 2f16); + %543 = mean(%542, axis=[-1], keepdims=True); + %544 = add(%543, 1e-05f16); + %545 = sqrt(%544); + %546 = divide(%541, %545); + %547 = multiply(%546, meta[relay.Constant][108]); + %548 = add(%547, meta[relay.Constant][109]); + %549 = reshape(%548, newshape=[-1, 768]); + %550 = nn.dense(%549, meta[relay.Constant][110], units=3072); + %551 = add(%550, meta[relay.Constant][111]); + %552 = reshape(%551, newshape=[50, 32, 3072]); + %553 = power(%552, 3f16); + %554 = multiply(%553, 0.044715f16); + %555 = add(%552, %554); + %556 = multiply(%555, 0.797885f16); + %557 = tanh(%556); + %558 = multiply(%552, 0.5f16); + %559 = add(%557, 1f16); + %560 = multiply(%558, %559); + %561 = reshape(%560, newshape=[-1, 3072]); + %562 = nn.dense(%561, meta[relay.Constant][112], units=768); + %563 = add(%562, meta[relay.Constant][113]); + %564 = reshape(%563, newshape=[50, 32, 768]); + %565 = add(%539, %564); + %566 = mean(%565, axis=[-1], keepdims=True); + %567 = subtract(%565, %566); + %568 = power(%567, 2f16); + %569 = mean(%568, axis=[-1], keepdims=True); + %570 = add(%569, 1e-05f16); + %571 = sqrt(%570); + %572 = divide(%567, %571); + %573 = multiply(%572, meta[relay.Constant][114]); + %574 = add(%573, meta[relay.Constant][115]); + %575 = reshape(%574, newshape=[-1, 768]); + %576 = nn.dense(%575, meta[relay.Constant][116], units=2304); + %577 = add(%576, meta[relay.Constant][117]); + %578 = reshape(%577, newshape=[50, 32, 2304]); + %579 = split(%578, indices_or_sections=[768, 1536], axis=2); + %580 = %579.0; + %581 = reshape(%580, newshape=[50, 32, 12, 64]); + %582 = transpose(%581, axes=[0, 2, 1, 3]); + %583 = %579.1; + %584 = reshape(%583, newshape=[50, 32, 12, 64]); + %585 = transpose(%584, axes=[0, 2, 3, 1]); + %586 = reshape(%585, newshape=[-1, 64, 32]); + %587 = reshape(%582, newshape=[-1, 32, 64]); + %588 = transpose(%586, axes=[0, 2, 1]); + %589 = nn.batch_matmul(%587, %588, out_dtype="float16", transpose_b=True); + %590 = 
reshape(%589, newshape=[50, 12, 32, 32]); + %591 = divide(%590, 8f16); + %592 = multiply(%591, meta[relay.Constant][118]); + %593 = subtract(%592, meta[relay.Constant][119]); + %594 = nn.softmax(%593, axis=3); + %595 = %579.2; + %596 = reshape(%595, newshape=[50, 32, 12, 64]); + %597 = transpose(%596, axes=[0, 2, 1, 3]); + %598 = reshape(%597, newshape=[-1, 32, 64]); + %599 = reshape(%594, newshape=[-1, 32, 32]); + %600 = transpose(%598, axes=[0, 2, 1]); + %601 = nn.batch_matmul(%599, %600, out_dtype="float16", transpose_b=True); + %602 = reshape(%601, newshape=[50, 12, 32, 64]); + %603 = transpose(%602, axes=[0, 2, 1, 3]); + %604 = reshape(%603, newshape=[50, 32, 768]); + %605 = reshape(%604, newshape=[-1, 768]); + %606 = nn.dense(%605, meta[relay.Constant][120], units=768); + %607 = add(%606, meta[relay.Constant][121]); + %608 = reshape(%607, newshape=[50, 32, 768]); + %609 = add(%565, %608); + %610 = mean(%609, axis=[-1], keepdims=True); + %611 = subtract(%609, %610); + %612 = power(%611, 2f16); + %613 = mean(%612, axis=[-1], keepdims=True); + %614 = add(%613, 1e-05f16); + %615 = sqrt(%614); + %616 = divide(%611, %615); + %617 = multiply(%616, meta[relay.Constant][122]); + %618 = add(%617, meta[relay.Constant][123]); + %619 = reshape(%618, newshape=[-1, 768]); + %620 = nn.dense(%619, meta[relay.Constant][124], units=3072); + %621 = add(%620, meta[relay.Constant][125]); + %622 = reshape(%621, newshape=[50, 32, 3072]); + %623 = power(%622, 3f16); + %624 = multiply(%623, 0.044715f16); + %625 = add(%622, %624); + %626 = multiply(%625, 0.797885f16); + %627 = tanh(%626); + %628 = multiply(%622, 0.5f16); + %629 = add(%627, 1f16); + %630 = multiply(%628, %629); + %631 = reshape(%630, newshape=[-1, 3072]); + %632 = nn.dense(%631, meta[relay.Constant][126], units=768); + %633 = add(%632, meta[relay.Constant][127]); + %634 = reshape(%633, newshape=[50, 32, 768]); + %635 = add(%609, %634); + %636 = mean(%635, axis=[-1], keepdims=True); + %637 = subtract(%635, %636); + %638 
= power(%637, 2f16); + %639 = mean(%638, axis=[-1], keepdims=True); + %640 = add(%639, 1e-05f16); + %641 = sqrt(%640); + %642 = divide(%637, %641); + %643 = multiply(%642, meta[relay.Constant][128]); + %644 = add(%643, meta[relay.Constant][129]); + %645 = reshape(%644, newshape=[-1, 768]); + %646 = nn.dense(%645, meta[relay.Constant][130], units=2304); + %647 = add(%646, meta[relay.Constant][131]); + %648 = reshape(%647, newshape=[50, 32, 2304]); + %649 = split(%648, indices_or_sections=[768, 1536], axis=2); + %650 = %649.0; + %651 = reshape(%650, newshape=[50, 32, 12, 64]); + %652 = transpose(%651, axes=[0, 2, 1, 3]); + %653 = %649.1; + %654 = reshape(%653, newshape=[50, 32, 12, 64]); + %655 = transpose(%654, axes=[0, 2, 3, 1]); + %656 = reshape(%655, newshape=[-1, 64, 32]); + %657 = reshape(%652, newshape=[-1, 32, 64]); + %658 = transpose(%656, axes=[0, 2, 1]); + %659 = nn.batch_matmul(%657, %658, out_dtype="float16", transpose_b=True); + %660 = reshape(%659, newshape=[50, 12, 32, 32]); + %661 = divide(%660, 8f16); + %662 = multiply(%661, meta[relay.Constant][132]); + %663 = subtract(%662, meta[relay.Constant][133]); + %664 = nn.softmax(%663, axis=3); + %665 = %649.2; + %666 = reshape(%665, newshape=[50, 32, 12, 64]); + %667 = transpose(%666, axes=[0, 2, 1, 3]); + %668 = reshape(%667, newshape=[-1, 32, 64]); + %669 = reshape(%664, newshape=[-1, 32, 32]); + %670 = transpose(%668, axes=[0, 2, 1]); + %671 = nn.batch_matmul(%669, %670, out_dtype="float16", transpose_b=True); + %672 = reshape(%671, newshape=[50, 12, 32, 64]); + %673 = transpose(%672, axes=[0, 2, 1, 3]); + %674 = reshape(%673, newshape=[50, 32, 768]); + %675 = reshape(%674, newshape=[-1, 768]); + %676 = nn.dense(%675, meta[relay.Constant][134], units=768); + %677 = add(%676, meta[relay.Constant][135]); + %678 = reshape(%677, newshape=[50, 32, 768]); + %679 = add(%635, %678); + %680 = mean(%679, axis=[-1], keepdims=True); + %681 = subtract(%679, %680); + %682 = power(%681, 2f16); + %683 = mean(%682, 
axis=[-1], keepdims=True); + %684 = add(%683, 1e-05f16); + %685 = sqrt(%684); + %686 = divide(%681, %685); + %687 = multiply(%686, meta[relay.Constant][136]); + %688 = add(%687, meta[relay.Constant][137]); + %689 = reshape(%688, newshape=[-1, 768]); + %690 = nn.dense(%689, meta[relay.Constant][138], units=3072); + %691 = add(%690, meta[relay.Constant][139]); + %692 = reshape(%691, newshape=[50, 32, 3072]); + %693 = power(%692, 3f16); + %694 = multiply(%693, 0.044715f16); + %695 = add(%692, %694); + %696 = multiply(%695, 0.797885f16); + %697 = tanh(%696); + %698 = multiply(%692, 0.5f16); + %699 = add(%697, 1f16); + %700 = multiply(%698, %699); + %701 = reshape(%700, newshape=[-1, 3072]); + %702 = nn.dense(%701, meta[relay.Constant][140], units=768); + %703 = add(%702, meta[relay.Constant][141]); + %704 = reshape(%703, newshape=[50, 32, 768]); + %705 = add(%679, %704); + %706 = mean(%705, axis=[-1], keepdims=True); + %707 = subtract(%705, %706); + %708 = power(%707, 2f16); + %709 = mean(%708, axis=[-1], keepdims=True); + %710 = add(%709, 1e-05f16); + %711 = sqrt(%710); + %712 = divide(%707, %711); + %713 = multiply(%712, meta[relay.Constant][142]); + %714 = add(%713, meta[relay.Constant][143]); + %715 = reshape(%714, newshape=[-1, 768]); + %716 = nn.dense(%715, meta[relay.Constant][144], units=2304); + %717 = add(%716, meta[relay.Constant][145]); + %718 = reshape(%717, newshape=[50, 32, 2304]); + %719 = split(%718, indices_or_sections=[768, 1536], axis=2); + %720 = %719.0; + %721 = reshape(%720, newshape=[50, 32, 12, 64]); + %722 = transpose(%721, axes=[0, 2, 1, 3]); + %723 = %719.1; + %724 = reshape(%723, newshape=[50, 32, 12, 64]); + %725 = transpose(%724, axes=[0, 2, 3, 1]); + %726 = reshape(%725, newshape=[-1, 64, 32]); + %727 = reshape(%722, newshape=[-1, 32, 64]); + %728 = transpose(%726, axes=[0, 2, 1]); + %729 = nn.batch_matmul(%727, %728, out_dtype="float16", transpose_b=True); + %730 = reshape(%729, newshape=[50, 12, 32, 32]); + %731 = divide(%730, 8f16); + 
%732 = multiply(%731, meta[relay.Constant][146]); + %733 = subtract(%732, meta[relay.Constant][147]); + %734 = nn.softmax(%733, axis=3); + %735 = %719.2; + %736 = reshape(%735, newshape=[50, 32, 12, 64]); + %737 = transpose(%736, axes=[0, 2, 1, 3]); + %738 = reshape(%737, newshape=[-1, 32, 64]); + %739 = reshape(%734, newshape=[-1, 32, 32]); + %740 = transpose(%738, axes=[0, 2, 1]); + %741 = nn.batch_matmul(%739, %740, out_dtype="float16", transpose_b=True); + %742 = reshape(%741, newshape=[50, 12, 32, 64]); + %743 = transpose(%742, axes=[0, 2, 1, 3]); + %744 = reshape(%743, newshape=[50, 32, 768]); + %745 = reshape(%744, newshape=[-1, 768]); + %746 = nn.dense(%745, meta[relay.Constant][148], units=768); + %747 = add(%746, meta[relay.Constant][149]); + %748 = reshape(%747, newshape=[50, 32, 768]); + %749 = add(%705, %748); + %750 = mean(%749, axis=[-1], keepdims=True); + %751 = subtract(%749, %750); + %752 = power(%751, 2f16); + %753 = mean(%752, axis=[-1], keepdims=True); + %754 = add(%753, 1e-05f16); + %755 = sqrt(%754); + %756 = divide(%751, %755); + %757 = multiply(%756, meta[relay.Constant][150]); + %758 = add(%757, meta[relay.Constant][151]); + %759 = reshape(%758, newshape=[-1, 768]); + %760 = nn.dense(%759, meta[relay.Constant][152], units=3072); + %761 = add(%760, meta[relay.Constant][153]); + %762 = reshape(%761, newshape=[50, 32, 3072]); + %763 = power(%762, 3f16); + %764 = multiply(%763, 0.044715f16); + %765 = add(%762, %764); + %766 = multiply(%765, 0.797885f16); + %767 = tanh(%766); + %768 = multiply(%762, 0.5f16); + %769 = add(%767, 1f16); + %770 = multiply(%768, %769); + %771 = reshape(%770, newshape=[-1, 3072]); + %772 = nn.dense(%771, meta[relay.Constant][154], units=768); + %773 = add(%772, meta[relay.Constant][155]); + %774 = reshape(%773, newshape=[50, 32, 768]); + %775 = add(%749, %774); + %776 = mean(%775, axis=[-1], keepdims=True); + %777 = subtract(%775, %776); + %778 = power(%777, 2f16); + %779 = mean(%778, axis=[-1], keepdims=True); + 
%780 = add(%779, 1e-05f16); + %781 = sqrt(%780); + %782 = divide(%777, %781); + %783 = multiply(%782, meta[relay.Constant][156]); + %784 = add(%783, meta[relay.Constant][157]); + %785 = reshape(%784, newshape=[-1, 768]); + %786 = nn.dense(%785, meta[relay.Constant][158], units=2304); + %787 = add(%786, meta[relay.Constant][159]); + %788 = reshape(%787, newshape=[50, 32, 2304]); + %789 = split(%788, indices_or_sections=[768, 1536], axis=2); + %790 = %789.0; + %791 = reshape(%790, newshape=[50, 32, 12, 64]); + %792 = transpose(%791, axes=[0, 2, 1, 3]); + %793 = %789.1; + %794 = reshape(%793, newshape=[50, 32, 12, 64]); + %795 = transpose(%794, axes=[0, 2, 3, 1]); + %796 = reshape(%795, newshape=[-1, 64, 32]); + %797 = reshape(%792, newshape=[-1, 32, 64]); + %798 = transpose(%796, axes=[0, 2, 1]); + %799 = nn.batch_matmul(%797, %798, out_dtype="float16", transpose_b=True); + %800 = reshape(%799, newshape=[50, 12, 32, 32]); + %801 = divide(%800, 8f16); + %802 = multiply(%801, meta[relay.Constant][160]); + %803 = subtract(%802, meta[relay.Constant][161]); + %804 = nn.softmax(%803, axis=3); + %805 = %789.2; + %806 = reshape(%805, newshape=[50, 32, 12, 64]); + %807 = transpose(%806, axes=[0, 2, 1, 3]); + %808 = reshape(%807, newshape=[-1, 32, 64]); + %809 = reshape(%804, newshape=[-1, 32, 32]); + %810 = transpose(%808, axes=[0, 2, 1]); + %811 = nn.batch_matmul(%809, %810, out_dtype="float16", transpose_b=True); + %812 = reshape(%811, newshape=[50, 12, 32, 64]); + %813 = transpose(%812, axes=[0, 2, 1, 3]); + %814 = reshape(%813, newshape=[50, 32, 768]); + %815 = reshape(%814, newshape=[-1, 768]); + %816 = nn.dense(%815, meta[relay.Constant][162], units=768); + %817 = add(%816, meta[relay.Constant][163]); + %818 = reshape(%817, newshape=[50, 32, 768]); + %819 = add(%775, %818); + %820 = mean(%819, axis=[-1], keepdims=True); + %821 = subtract(%819, %820); + %822 = power(%821, 2f16); + %823 = mean(%822, axis=[-1], keepdims=True); + %824 = add(%823, 1e-05f16); + %825 = 
sqrt(%824); + %826 = divide(%821, %825); + %827 = multiply(%826, meta[relay.Constant][164]); + %828 = add(%827, meta[relay.Constant][165]); + %829 = reshape(%828, newshape=[-1, 768]); + %830 = nn.dense(%829, meta[relay.Constant][166], units=3072); + %831 = add(%830, meta[relay.Constant][167]); + %832 = reshape(%831, newshape=[50, 32, 3072]); + %833 = power(%832, 3f16); + %834 = multiply(%833, 0.044715f16); + %835 = add(%832, %834); + %836 = multiply(%835, 0.797885f16); + %837 = tanh(%836); + %838 = multiply(%832, 0.5f16); + %839 = add(%837, 1f16); + %840 = multiply(%838, %839); + %841 = reshape(%840, newshape=[-1, 3072]); + %842 = nn.dense(%841, meta[relay.Constant][168], units=768); + %843 = add(%842, meta[relay.Constant][169]); + %844 = reshape(%843, newshape=[50, 32, 768]); + %845 = add(%819, %844); + %846 = mean(%845, axis=[-1], keepdims=True); + %847 = subtract(%845, %846); + %848 = power(%847, 2f16); + %849 = mean(%848, axis=[-1], keepdims=True); + %850 = add(%849, 1e-05f16); + %851 = sqrt(%850); + %852 = divide(%847, %851); + %853 = multiply(%852, meta[relay.Constant][170]); + %854 = add(%853, meta[relay.Constant][171]); + %855 = transpose(%24, axes=[0, 2, 1, 3]); + %856 = expand_dims(%855, axis=0); + %857 = expand_dims(%37, axis=0); + %858 = (%856, %857); + %859 = transpose(%94, axes=[0, 2, 1, 3]); + %860 = expand_dims(%859, axis=0); + %861 = expand_dims(%107, axis=0); + %862 = (%860, %861); + %863 = transpose(%164, axes=[0, 2, 1, 3]); + %864 = expand_dims(%863, axis=0); + %865 = expand_dims(%177, axis=0); + %866 = (%864, %865); + %867 = transpose(%234, axes=[0, 2, 1, 3]); + %868 = expand_dims(%867, axis=0); + %869 = expand_dims(%247, axis=0); + %870 = (%868, %869); + %871 = transpose(%304, axes=[0, 2, 1, 3]); + %872 = expand_dims(%871, axis=0); + %873 = expand_dims(%317, axis=0); + %874 = (%872, %873); + %875 = transpose(%374, axes=[0, 2, 1, 3]); + %876 = expand_dims(%875, axis=0); + %877 = expand_dims(%387, axis=0); + %878 = (%876, %877); + %879 = 
transpose(%444, axes=[0, 2, 1, 3]); + %880 = expand_dims(%879, axis=0); + %881 = expand_dims(%457, axis=0); + %882 = (%880, %881); + %883 = transpose(%514, axes=[0, 2, 1, 3]); + %884 = expand_dims(%883, axis=0); + %885 = expand_dims(%527, axis=0); + %886 = (%884, %885); + %887 = transpose(%584, axes=[0, 2, 1, 3]); + %888 = expand_dims(%887, axis=0); + %889 = expand_dims(%597, axis=0); + %890 = (%888, %889); + %891 = transpose(%654, axes=[0, 2, 1, 3]); + %892 = expand_dims(%891, axis=0); + %893 = expand_dims(%667, axis=0); + %894 = (%892, %893); + %895 = transpose(%724, axes=[0, 2, 1, 3]); + %896 = expand_dims(%895, axis=0); + %897 = expand_dims(%737, axis=0); + %898 = (%896, %897); + %899 = transpose(%794, axes=[0, 2, 1, 3]); + %900 = expand_dims(%899, axis=0); + %901 = expand_dims(%807, axis=0); + %902 = (%900, %901); + %903 = reshape(%854, newshape=[1, 50, 32, 768]); + %904 = concatenate(%858); + %905 = concatenate(%862); + %906 = concatenate(%866); + %907 = concatenate(%870); + %908 = concatenate(%874); + %909 = concatenate(%878); + %910 = concatenate(%882); + %911 = concatenate(%886); + %912 = concatenate(%890); + %913 = concatenate(%894); + %914 = concatenate(%898); + %915 = concatenate(%902); + (%903, %904, %905, %906, %907, %908, %909, %910, %911, %912, %913, %914, %915) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_16", + "input_shapes": {"x": [1, 50, 32]}, + "input_dtypes": {"x": "int64"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def gpt2_extract_consts(dtype): + return make_consts( + dtype, + [ + (768, 768), # 0 + (768,), # 1 + (768,), # 2 + (768,), # 3 + (3072, 768), # 4 + (3072,), # 5 + (1, 32, 768), # 6 + ], + ) + + +def gpt2_extract(): + metatable = {"relay.Constant": gpt2_extract_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1600, 768), float32]) -> Tensor[(50, 32, 3072), float32] { + %46 = nn.dense(%x, meta[relay.Constant][0], units=768); 
+ %47 = add(%46, meta[relay.Constant][1]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(meta[relay.Constant][6], %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][2]); + %58 = add(%57, meta[relay.Constant][3]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][4], units=3072); + %61 = add(%60, meta[relay.Constant][5]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f); + %64 = multiply(%63, 0.044715f); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f); + %69 = add(%67, 1f); + %70 = multiply(%68, %69); + %70 + } + """, + "from_string", + None, + metatable, + ) + return { + "input_shapes": {"x": [1600, 768]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def gpt2_extract_16(): + metatable = {"relay.Constant": gpt2_extract_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1600, 768), float16]) -> Tensor[(50, 32, 3072), float16] { + %46 = nn.dense(%x, meta[relay.Constant][0], units=768); + %47 = add(%46, meta[relay.Constant][1]); + %48 = reshape(%47, newshape=[50, 32, 768]); + %49 = add(meta[relay.Constant][6], %48); + %50 = mean(%49, axis=[-1], keepdims=True); + %51 = subtract(%49, %50); + %52 = power(%51, 2f16); + %53 = mean(%52, axis=[-1], keepdims=True); + %54 = add(%53, 1e-05f16); + %55 = sqrt(%54); + %56 = divide(%51, %55); + %57 = multiply(%56, meta[relay.Constant][2]); + %58 = add(%57, meta[relay.Constant][3]); + %59 = reshape(%58, newshape=[-1, 768]); + %60 = nn.dense(%59, meta[relay.Constant][4], units=3072); + %61 = add(%60, meta[relay.Constant][5]); + %62 = reshape(%61, newshape=[50, 32, 3072]); + %63 = power(%62, 3f16); + 
%64 = multiply(%63, 0.044715f16); + %65 = add(%62, %64); + %66 = multiply(%65, 0.797885f16); + %67 = tanh(%66); + %68 = multiply(%62, 0.5f16); + %69 = add(%67, 1f16); + %70 = multiply(%68, %69); + %70 + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_extract_16", + "input_shapes": {"x": [1600, 768]}, + "input_dtypes": {"x": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def gpt2_16_for_cutlass_extract_consts(dtype): + return make_consts( + "float16", + [ + (2304, 768), # 0 + (2304,), # 1 + (600, 32, 64), # 2 + (600, 32, 32), # 3 + ], + ) + + +def gpt2_16_for_cutlass_extract(): + metatable = {"relay.Constant": gpt2_16_for_cutlass_extract_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0: Tensor[(1600, 768), float16], + %x3: Tensor[(600, 32, 64), float16]) + -> (Tensor[(1600, 2304), float16], Tensor[(1200, 32, 32), float16]) { + %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304); + %1 = add(%0, meta[relay.Constant][1]); + %2 = nn.batch_matmul(%x3, meta[relay.Constant][2], out_dtype="float16", transpose_b=True); + %3 = (%2, meta[relay.Constant][3]); + %4 = concatenate(%3); + (%1, %4) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "gpt2_16_for_cutlass_extract", + "input_shapes": {"x0": (1600, 768), "x3": (600, 32, 64)}, + "input_dtypes": {"x0": "float16", "x3": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def resnet50_consts(dtype): + return make_consts( + dtype, + [ + (3,), # 0 + (3,), # 1 + (3,), # 2 + (3,), # 3 + (64, 3, 7, 7), # 4 + (64,), # 5 + (64,), # 6 + (64,), # 7 + (64,), # 8 + (64,), # 9 + (64,), # 10 + (64,), # 11 + (64,), # 12 + (64, 64, 1, 1), # 13 + (64,), # 14 + (64,), # 15 + (64,), # 16 + (64,), # 17 + (64, 64, 3, 3), # 18 + (64,), # 19 + (64,), # 20 + (64,), # 21 + (64,), # 22 + (256, 64, 1, 1), # 23 + (256, 64, 1, 1), # 24 + (256,), # 25 + (256,), # 26 + (256,), # 27 + (256,), # 28 + 
(64, 256, 1, 1), # 29 + (64,), # 30 + (64,), # 31 + (64,), # 32 + (64,), # 33 + (64, 64, 3, 3), # 34 + (64,), # 35 + (64,), # 36 + (64,), # 37 + (64,), # 38 + (256, 64, 1, 1), # 39 + (256,), # 40 + (256,), # 41 + (256,), # 42 + (256,), # 43 + (64, 256, 1, 1), # 44 + (64,), # 45 + (64,), # 46 + (64,), # 47 + (64,), # 48 + (64, 64, 3, 3), # 49 + (64,), # 50 + (64,), # 51 + (64,), # 52 + (64,), # 53 + (256, 64, 1, 1), # 54 + (256,), # 55 + (256,), # 56 + (256,), # 57 + (256,), # 58 + (128, 256, 1, 1), # 59 + (128,), # 60 + (128,), # 61 + (128,), # 62 + (128,), # 63 + (128, 128, 3, 3), # 64 + (128,), # 65 + (128,), # 66 + (128,), # 67 + (128,), # 68 + (512, 128, 1, 1), # 69 + (512, 256, 1, 1), # 70 + (512,), # 71 + (512,), # 72 + (512,), # 73 + (512,), # 74 + (128, 512, 1, 1), # 75 + (128,), # 76 + (128,), # 77 + (128,), # 78 + (128,), # 79 + (128, 128, 3, 3), # 80 + (128,), # 81 + (128,), # 82 + (128,), # 83 + (128,), # 84 + (512, 128, 1, 1), # 85 + (512,), # 86 + (512,), # 87 + (512,), # 88 + (512,), # 89 + (128, 512, 1, 1), # 90 + (128,), # 91 + (128,), # 92 + (128,), # 93 + (128,), # 94 + (128, 128, 3, 3), # 95 + (128,), # 96 + (128,), # 97 + (128,), # 98 + (128,), # 99 + (512, 128, 1, 1), # 100 + (512,), # 101 + (512,), # 102 + (512,), # 103 + (512,), # 104 + (128, 512, 1, 1), # 105 + (128,), # 106 + (128,), # 107 + (128,), # 108 + (128,), # 109 + (128, 128, 3, 3), # 110 + (128,), # 111 + (128,), # 112 + (128,), # 113 + (128,), # 114 + (512, 128, 1, 1), # 115 + (512,), # 116 + (512,), # 117 + (512,), # 118 + (512,), # 119 + (256, 512, 1, 1), # 120 + (256,), # 121 + (256,), # 122 + (256,), # 123 + (256,), # 124 + (256, 256, 3, 3), # 125 + (256,), # 126 + (256,), # 127 + (256,), # 128 + (256,), # 129 + (1024, 256, 1, 1), # 130 + (1024, 512, 1, 1), # 131 + (1024,), # 132 + (1024,), # 133 + (1024,), # 134 + (1024,), # 135 + (256, 1024, 1, 1), # 136 + (256,), # 137 + (256,), # 138 + (256,), # 139 + (256,), # 140 + (256, 256, 3, 3), # 141 + (256,), # 142 + (256,), # 143 
+ (256,), # 144 + (256,), # 145 + (1024, 256, 1, 1), # 146 + (1024,), # 147 + (1024,), # 148 + (1024,), # 149 + (1024,), # 150 + (256, 1024, 1, 1), # 151 + (256,), # 152 + (256,), # 153 + (256,), # 154 + (256,), # 155 + (256, 256, 3, 3), # 156 + (256,), # 157 + (256,), # 158 + (256,), # 159 + (256,), # 160 + (1024, 256, 1, 1), # 161 + (1024,), # 162 + (1024,), # 163 + (1024,), # 164 + (1024,), # 165 + (256, 1024, 1, 1), # 166 + (256,), # 167 + (256,), # 168 + (256,), # 169 + (256,), # 170 + (256, 256, 3, 3), # 171 + (256,), # 172 + (256,), # 173 + (256,), # 174 + (256,), # 175 + (1024, 256, 1, 1), # 176 + (1024,), # 177 + (1024,), # 178 + (1024,), # 179 + (1024,), # 180 + (256, 1024, 1, 1), # 181 + (256,), # 182 + (256,), # 183 + (256,), # 184 + (256,), # 185 + (256, 256, 3, 3), # 186 + (256,), # 187 + (256,), # 188 + (256,), # 189 + (256,), # 190 + (1024, 256, 1, 1), # 191 + (1024,), # 192 + (1024,), # 193 + (1024,), # 194 + (1024,), # 195 + (256, 1024, 1, 1), # 196 + (256,), # 197 + (256,), # 198 + (256,), # 199 + (256,), # 200 + (256, 256, 3, 3), # 201 + (256,), # 202 + (256,), # 203 + (256,), # 204 + (256,), # 205 + (1024, 256, 1, 1), # 206 + (1024,), # 207 + (1024,), # 208 + (1024,), # 209 + (1024,), # 210 + (512, 1024, 1, 1), # 211 + (512,), # 212 + (512,), # 213 + (512,), # 214 + (512,), # 215 + (512, 512, 3, 3), # 216 + (512,), # 217 + (512,), # 218 + (512,), # 219 + (512,), # 220 + (2048, 512, 1, 1), # 221 + (2048, 1024, 1, 1), # 222 + (2048,), # 223 + (2048,), # 224 + (2048,), # 225 + (2048,), # 226 + (512, 2048, 1, 1), # 227 + (512,), # 228 + (512,), # 229 + (512,), # 230 + (512,), # 231 + (512, 512, 3, 3), # 232 + (512,), # 233 + (512,), # 234 + (512,), # 235 + (512,), # 236 + (2048, 512, 1, 1), # 237 + (2048,), # 238 + (2048,), # 239 + (2048,), # 240 + (2048,), # 241 + (512, 2048, 1, 1), # 242 + (512,), # 243 + (512,), # 244 + (512,), # 245 + (512,), # 246 + (512, 512, 3, 3), # 247 + (512,), # 248 + (512,), # 249 + (512,), # 250 + (512,), # 251 + 
(2048, 512, 1, 1), # 252 + (2048,), # 253 + (2048,), # 254 + (2048,), # 255 + (2048,), # 256 + (1000, 2048), # 257 + (1000,), # 258 + ], + ) + + +def resnet50(): + metatable = {"relay.Constant": resnet50_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] { + %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %1 = %0.0; + %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]); + %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]); + %4 = %3.0; + %5 = nn.relu(%4); + %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]); + %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]); + %8 = %7.0; + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]); + %12 = %11.0; + %13 = nn.relu(%12); + %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]); + %16 = %15.0; + %17 = nn.relu(%16); + %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%18, %19); + %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]); + %22 = %21.0; + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, 
meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %25 = nn.batch_norm(%24, meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]); + %26 = %25.0; + %27 = nn.relu(%26); + %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]); + %30 = %29.0; + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %33 = add(%32, %20); + %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]); + %35 = %34.0; + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]); + %39 = %38.0; + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]); + %43 = %42.0; + %44 = nn.relu(%43); + %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %46 = add(%45, %33); + %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]); + %48 = %47.0; + %49 = nn.relu(%48); + %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]); + %52 = %51.0; + %53 = nn.relu(%52); + %54 = nn.conv2d(%53, meta[relay.Constant][64], 
strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %55 = nn.batch_norm(%54, meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]); + %56 = %55.0; + %57 = nn.relu(%56); + %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = add(%58, %59); + %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %62 = %61.0; + %63 = nn.relu(%62); + %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %66 = %65.0; + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %70 = %69.0; + %71 = nn.relu(%70); + %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %73 = add(%72, %60); + %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], 
meta[relay.Constant][99]); + %83 = %82.0; + %84 = nn.relu(%83); + %85 = nn.conv2d(%84, meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %86 = add(%85, %73); + %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %88 = %87.0; + %89 = nn.relu(%88); + %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %92 = %91.0; + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %96 = %95.0; + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %99 = add(%98, %86); + %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %101 = %100.0; + %102 = nn.relu(%101); + %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %105 = %104.0; + %106 = nn.relu(%105); + %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %109 = %108.0; + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %112 = nn.conv2d(%102, meta[relay.Constant][131], 
strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %113 = add(%111, %112); + %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]); + %115 = %114.0; + %116 = nn.relu(%115); + %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]); + %119 = %118.0; + %120 = nn.relu(%119); + %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]); + %123 = %122.0; + %124 = nn.relu(%123); + %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %126 = add(%125, %113); + %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]); + %128 = %127.0; + %129 = nn.relu(%128); + %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]); + %132 = %131.0; + %133 = nn.relu(%132); + %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]); + %136 = %135.0; + %137 = nn.relu(%136); + %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %139 = add(%138, %126); + %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], 
meta[relay.Constant][165]); + %141 = %140.0; + %142 = nn.relu(%141); + %143 = nn.conv2d(%142, meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %152 = add(%151, %139); + %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]); + %154 = %153.0; + %155 = nn.relu(%154); + %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]); + %158 = %157.0; + %159 = nn.relu(%158); + %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]); + %162 = %161.0; + %163 = nn.relu(%162); + %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %165 = add(%164, %152); + %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]); + %167 = %166.0; + %168 = nn.relu(%167); + %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %170 = nn.batch_norm(%169, 
meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], meta[relay.Constant][200]); + %171 = %170.0; + %172 = nn.relu(%171); + %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]); + %175 = %174.0; + %176 = nn.relu(%175); + %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %178 = add(%177, %165); + %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]); + %188 = %187.0; + %189 = nn.relu(%188); + %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %192 = add(%190, %191); + %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]); + %194 = %193.0; + %195 = nn.relu(%194); + %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], 
meta[relay.Constant][230], meta[relay.Constant][231]); + %198 = %197.0; + %199 = nn.relu(%198); + %200 = nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]); + %202 = %201.0; + %203 = nn.relu(%202); + %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %205 = add(%204, %192); + %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]); + %207 = %206.0; + %208 = nn.relu(%207); + %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]); + %211 = %210.0; + %212 = nn.relu(%211); + %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]); + %215 = %214.0; + %216 = nn.relu(%215); + %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %218 = add(%217, %205); + %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]); + %220 = %219.0; + %221 = nn.relu(%220); + %222 = nn.global_avg_pool2d(%221); + %223 = reshape(%222, newshape=[0, -1]); + %224 = nn.dense(%223, meta[relay.Constant][257], units=1000); + add(%224, meta[relay.Constant][258]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnet50", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "mod": mod, + "params": None, + "main_dtype": 
"float32", + } + + +def resnet50_16(): + metatable = {"relay.Constant": resnet50_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] { + %0 = nn.batch_norm(%data, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %1 = %0.0; + %2 = nn.conv2d(%1, meta[relay.Constant][4], strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7]); + %3 = nn.batch_norm(%2, meta[relay.Constant][5], meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8]); + %4 = %3.0; + %5 = nn.relu(%4); + %6 = nn.max_pool2d(%5, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]); + %7 = nn.batch_norm(%6, meta[relay.Constant][9], meta[relay.Constant][10], meta[relay.Constant][11], meta[relay.Constant][12]); + %8 = %7.0; + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][13], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %11 = nn.batch_norm(%10, meta[relay.Constant][14], meta[relay.Constant][15], meta[relay.Constant][16], meta[relay.Constant][17]); + %12 = %11.0; + %13 = nn.relu(%12); + %14 = nn.conv2d(%13, meta[relay.Constant][18], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %15 = nn.batch_norm(%14, meta[relay.Constant][19], meta[relay.Constant][20], meta[relay.Constant][21], meta[relay.Constant][22]); + %16 = %15.0; + %17 = nn.relu(%16); + %18 = nn.conv2d(%17, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %19 = nn.conv2d(%9, meta[relay.Constant][24], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%18, %19); + %21 = nn.batch_norm(%20, meta[relay.Constant][25], meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28]); + %22 = %21.0; + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %25 = nn.batch_norm(%24, 
meta[relay.Constant][30], meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33]); + %26 = %25.0; + %27 = nn.relu(%26); + %28 = nn.conv2d(%27, meta[relay.Constant][34], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %29 = nn.batch_norm(%28, meta[relay.Constant][35], meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38]); + %30 = %29.0; + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %33 = add(%32, %20); + %34 = nn.batch_norm(%33, meta[relay.Constant][40], meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43]); + %35 = %34.0; + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %38 = nn.batch_norm(%37, meta[relay.Constant][45], meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48]); + %39 = %38.0; + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][49], padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); + %42 = nn.batch_norm(%41, meta[relay.Constant][50], meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53]); + %43 = %42.0; + %44 = nn.relu(%43); + %45 = nn.conv2d(%44, meta[relay.Constant][54], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %46 = add(%45, %33); + %47 = nn.batch_norm(%46, meta[relay.Constant][55], meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58]); + %48 = %47.0; + %49 = nn.relu(%48); + %50 = nn.conv2d(%49, meta[relay.Constant][59], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %51 = nn.batch_norm(%50, meta[relay.Constant][60], meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63]); + %52 = %51.0; + %53 = nn.relu(%52); + %54 = nn.conv2d(%53, meta[relay.Constant][64], strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %55 = nn.batch_norm(%54, 
meta[relay.Constant][65], meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68]); + %56 = %55.0; + %57 = nn.relu(%56); + %58 = nn.conv2d(%57, meta[relay.Constant][69], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %59 = nn.conv2d(%49, meta[relay.Constant][70], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = add(%58, %59); + %61 = nn.batch_norm(%60, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %62 = %61.0; + %63 = nn.relu(%62); + %64 = nn.conv2d(%63, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %65 = nn.batch_norm(%64, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %66 = %65.0; + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][80], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %69 = nn.batch_norm(%68, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %70 = %69.0; + %71 = nn.relu(%70); + %72 = nn.conv2d(%71, meta[relay.Constant][85], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %73 = add(%72, %60); + %74 = nn.batch_norm(%73, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %78 = nn.batch_norm(%77, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][95], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %82 = nn.batch_norm(%81, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]); + %83 = %82.0; + %84 = nn.relu(%83); + %85 = nn.conv2d(%84, 
meta[relay.Constant][100], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %86 = add(%85, %73); + %87 = nn.batch_norm(%86, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %88 = %87.0; + %89 = nn.relu(%88); + %90 = nn.conv2d(%89, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %91 = nn.batch_norm(%90, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %92 = %91.0; + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][110], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3]); + %95 = nn.batch_norm(%94, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %96 = %95.0; + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][115], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %99 = add(%98, %86); + %100 = nn.batch_norm(%99, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %101 = %100.0; + %102 = nn.relu(%101); + %103 = nn.conv2d(%102, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %104 = nn.batch_norm(%103, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %105 = %104.0; + %106 = nn.relu(%105); + %107 = nn.conv2d(%106, meta[relay.Constant][125], strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %108 = nn.batch_norm(%107, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %109 = %108.0; + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][130], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %112 = nn.conv2d(%102, meta[relay.Constant][131], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %113 = add(%111, 
%112); + %114 = nn.batch_norm(%113, meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134], meta[relay.Constant][135]); + %115 = %114.0; + %116 = nn.relu(%115); + %117 = nn.conv2d(%116, meta[relay.Constant][136], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %118 = nn.batch_norm(%117, meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139], meta[relay.Constant][140]); + %119 = %118.0; + %120 = nn.relu(%119); + %121 = nn.conv2d(%120, meta[relay.Constant][141], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %122 = nn.batch_norm(%121, meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144], meta[relay.Constant][145]); + %123 = %122.0; + %124 = nn.relu(%123); + %125 = nn.conv2d(%124, meta[relay.Constant][146], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %126 = add(%125, %113); + %127 = nn.batch_norm(%126, meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149], meta[relay.Constant][150]); + %128 = %127.0; + %129 = nn.relu(%128); + %130 = nn.conv2d(%129, meta[relay.Constant][151], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %131 = nn.batch_norm(%130, meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154], meta[relay.Constant][155]); + %132 = %131.0; + %133 = nn.relu(%132); + %134 = nn.conv2d(%133, meta[relay.Constant][156], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %135 = nn.batch_norm(%134, meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159], meta[relay.Constant][160]); + %136 = %135.0; + %137 = nn.relu(%136); + %138 = nn.conv2d(%137, meta[relay.Constant][161], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %139 = add(%138, %126); + %140 = nn.batch_norm(%139, meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164], meta[relay.Constant][165]); + %141 = %140.0; + %142 = nn.relu(%141); + %143 = nn.conv2d(%142, 
meta[relay.Constant][166], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169], meta[relay.Constant][170]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][171], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174], meta[relay.Constant][175]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][176], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %152 = add(%151, %139); + %153 = nn.batch_norm(%152, meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179], meta[relay.Constant][180]); + %154 = %153.0; + %155 = nn.relu(%154); + %156 = nn.conv2d(%155, meta[relay.Constant][181], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %157 = nn.batch_norm(%156, meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184], meta[relay.Constant][185]); + %158 = %157.0; + %159 = nn.relu(%158); + %160 = nn.conv2d(%159, meta[relay.Constant][186], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %161 = nn.batch_norm(%160, meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189], meta[relay.Constant][190]); + %162 = %161.0; + %163 = nn.relu(%162); + %164 = nn.conv2d(%163, meta[relay.Constant][191], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %165 = add(%164, %152); + %166 = nn.batch_norm(%165, meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194], meta[relay.Constant][195]); + %167 = %166.0; + %168 = nn.relu(%167); + %169 = nn.conv2d(%168, meta[relay.Constant][196], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %170 = nn.batch_norm(%169, meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199], 
meta[relay.Constant][200]); + %171 = %170.0; + %172 = nn.relu(%171); + %173 = nn.conv2d(%172, meta[relay.Constant][201], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]); + %174 = nn.batch_norm(%173, meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204], meta[relay.Constant][205]); + %175 = %174.0; + %176 = nn.relu(%175); + %177 = nn.conv2d(%176, meta[relay.Constant][206], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %178 = add(%177, %165); + %179 = nn.batch_norm(%178, meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209], meta[relay.Constant][210]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][211], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %183 = nn.batch_norm(%182, meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214], meta[relay.Constant][215]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][216], strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %187 = nn.batch_norm(%186, meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219], meta[relay.Constant][220]); + %188 = %187.0; + %189 = nn.relu(%188); + %190 = nn.conv2d(%189, meta[relay.Constant][221], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %191 = nn.conv2d(%181, meta[relay.Constant][222], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %192 = add(%190, %191); + %193 = nn.batch_norm(%192, meta[relay.Constant][223], meta[relay.Constant][224], meta[relay.Constant][225], meta[relay.Constant][226]); + %194 = %193.0; + %195 = nn.relu(%194); + %196 = nn.conv2d(%195, meta[relay.Constant][227], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %197 = nn.batch_norm(%196, meta[relay.Constant][228], meta[relay.Constant][229], meta[relay.Constant][230], meta[relay.Constant][231]); + %198 = %197.0; + %199 = nn.relu(%198); + %200 
= nn.conv2d(%199, meta[relay.Constant][232], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %201 = nn.batch_norm(%200, meta[relay.Constant][233], meta[relay.Constant][234], meta[relay.Constant][235], meta[relay.Constant][236]); + %202 = %201.0; + %203 = nn.relu(%202); + %204 = nn.conv2d(%203, meta[relay.Constant][237], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %205 = add(%204, %192); + %206 = nn.batch_norm(%205, meta[relay.Constant][238], meta[relay.Constant][239], meta[relay.Constant][240], meta[relay.Constant][241]); + %207 = %206.0; + %208 = nn.relu(%207); + %209 = nn.conv2d(%208, meta[relay.Constant][242], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %210 = nn.batch_norm(%209, meta[relay.Constant][243], meta[relay.Constant][244], meta[relay.Constant][245], meta[relay.Constant][246]); + %211 = %210.0; + %212 = nn.relu(%211); + %213 = nn.conv2d(%212, meta[relay.Constant][247], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]); + %214 = nn.batch_norm(%213, meta[relay.Constant][248], meta[relay.Constant][249], meta[relay.Constant][250], meta[relay.Constant][251]); + %215 = %214.0; + %216 = nn.relu(%215); + %217 = nn.conv2d(%216, meta[relay.Constant][252], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %218 = add(%217, %205); + %219 = nn.batch_norm(%218, meta[relay.Constant][253], meta[relay.Constant][254], meta[relay.Constant][255], meta[relay.Constant][256]); + %220 = %219.0; + %221 = nn.relu(%220); + %222 = nn.global_avg_pool2d(%221); + %223 = reshape(%222, newshape=[0, -1]); + %224 = nn.dense(%223, meta[relay.Constant][257], units=1000); + add(%224, meta[relay.Constant][258]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnet50_16", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def mobilenet_consts(dtype): + return make_consts( + dtype, + [ + (32, 3, 3, 3), # 0 + 
(32,), # 1 + (32,), # 2 + (32,), # 3 + (32,), # 4 + (32, 32, 1, 1), # 5 + (32,), # 6 + (32,), # 7 + (32,), # 8 + (32,), # 9 + (32, 1, 3, 3), # 10 + (32,), # 11 + (32,), # 12 + (32,), # 13 + (32,), # 14 + (16, 32, 1, 1), # 15 + (16,), # 16 + (16,), # 17 + (16,), # 18 + (16,), # 19 + (96, 16, 1, 1), # 20 + (96,), # 21 + (96,), # 22 + (96,), # 23 + (96,), # 24 + (96, 1, 3, 3), # 25 + (96,), # 26 + (96,), # 27 + (96,), # 28 + (96,), # 29 + (24, 96, 1, 1), # 30 + (24,), # 31 + (24,), # 32 + (24,), # 33 + (24,), # 34 + (144, 24, 1, 1), # 35 + (144,), # 36 + (144,), # 37 + (144,), # 38 + (144,), # 39 + (144, 1, 3, 3), # 40 + (144,), # 41 + (144,), # 42 + (144,), # 43 + (144,), # 44 + (24, 144, 1, 1), # 45 + (24,), # 46 + (24,), # 47 + (24,), # 48 + (24,), # 49 + (144, 24, 1, 1), # 50 + (144,), # 51 + (144,), # 52 + (144,), # 53 + (144,), # 54 + (144, 1, 3, 3), # 55 + (144,), # 56 + (144,), # 57 + (144,), # 58 + (144,), # 59 + (32, 144, 1, 1), # 60 + (32,), # 61 + (32,), # 62 + (32,), # 63 + (32,), # 64 + (192, 32, 1, 1), # 65 + (192,), # 66 + (192,), # 67 + (192,), # 68 + (192,), # 69 + (192, 1, 3, 3), # 70 + (192,), # 71 + (192,), # 72 + (192,), # 73 + (192,), # 74 + (32, 192, 1, 1), # 75 + (32,), # 76 + (32,), # 77 + (32,), # 78 + (32,), # 79 + (192, 32, 1, 1), # 80 + (192,), # 81 + (192,), # 82 + (192,), # 83 + (192,), # 84 + (192, 1, 3, 3), # 85 + (192,), # 86 + (192,), # 87 + (192,), # 88 + (192,), # 89 + (32, 192, 1, 1), # 90 + (32,), # 91 + (32,), # 92 + (32,), # 93 + (32,), # 94 + (192, 32, 1, 1), # 95 + (192,), # 96 + (192,), # 97 + (192,), # 98 + (192,), # 99 + (192, 1, 3, 3), # 100 + (192,), # 101 + (192,), # 102 + (192,), # 103 + (192,), # 104 + (64, 192, 1, 1), # 105 + (64,), # 106 + (64,), # 107 + (64,), # 108 + (64,), # 109 + (384, 64, 1, 1), # 110 + (384,), # 111 + (384,), # 112 + (384,), # 113 + (384,), # 114 + (384, 1, 3, 3), # 115 + (384,), # 116 + (384,), # 117 + (384,), # 118 + (384,), # 119 + (64, 384, 1, 1), # 120 + (64,), # 121 + (64,), # 122 + 
(64,), # 123 + (64,), # 124 + (384, 64, 1, 1), # 125 + (384,), # 126 + (384,), # 127 + (384,), # 128 + (384,), # 129 + (384, 1, 3, 3), # 130 + (384,), # 131 + (384,), # 132 + (384,), # 133 + (384,), # 134 + (64, 384, 1, 1), # 135 + (64,), # 136 + (64,), # 137 + (64,), # 138 + (64,), # 139 + (384, 64, 1, 1), # 140 + (384,), # 141 + (384,), # 142 + (384,), # 143 + (384,), # 144 + (384, 1, 3, 3), # 145 + (384,), # 146 + (384,), # 147 + (384,), # 148 + (384,), # 149 + (64, 384, 1, 1), # 150 + (64,), # 151 + (64,), # 152 + (64,), # 153 + (64,), # 154 + (384, 64, 1, 1), # 155 + (384,), # 156 + (384,), # 157 + (384,), # 158 + (384,), # 159 + (384, 1, 3, 3), # 160 + (384,), # 161 + (384,), # 162 + (384,), # 163 + (384,), # 164 + (96, 384, 1, 1), # 165 + (96,), # 166 + (96,), # 167 + (96,), # 168 + (96,), # 169 + (576, 96, 1, 1), # 170 + (576,), # 171 + (576,), # 172 + (576,), # 173 + (576,), # 174 + (576, 1, 3, 3), # 175 + (576,), # 176 + (576,), # 177 + (576,), # 178 + (576,), # 179 + (96, 576, 1, 1), # 180 + (96,), # 181 + (96,), # 182 + (96,), # 183 + (96,), # 184 + (576, 96, 1, 1), # 185 + (576,), # 186 + (576,), # 187 + (576,), # 188 + (576,), # 189 + (576, 1, 3, 3), # 190 + (576,), # 191 + (576,), # 192 + (576,), # 193 + (576,), # 194 + (96, 576, 1, 1), # 195 + (96,), # 196 + (96,), # 197 + (96,), # 198 + (96,), # 199 + (576, 96, 1, 1), # 200 + (576,), # 201 + (576,), # 202 + (576,), # 203 + (576,), # 204 + (576, 1, 3, 3), # 205 + (576,), # 206 + (576,), # 207 + (576,), # 208 + (576,), # 209 + (160, 576, 1, 1), # 210 + (160,), # 211 + (160,), # 212 + (160,), # 213 + (160,), # 214 + (960, 160, 1, 1), # 215 + (960,), # 216 + (960,), # 217 + (960,), # 218 + (960,), # 219 + (960, 1, 3, 3), # 220 + (960,), # 221 + (960,), # 222 + (960,), # 223 + (960,), # 224 + (160, 960, 1, 1), # 225 + (160,), # 226 + (160,), # 227 + (160,), # 228 + (160,), # 229 + (960, 160, 1, 1), # 230 + (960,), # 231 + (960,), # 232 + (960,), # 233 + (960,), # 234 + (960, 1, 3, 3), # 235 + (960,), # 
236 + (960,), # 237 + (960,), # 238 + (960,), # 239 + (160, 960, 1, 1), # 240 + (160,), # 241 + (160,), # 242 + (160,), # 243 + (160,), # 244 + (960, 160, 1, 1), # 245 + (960,), # 246 + (960,), # 247 + (960,), # 248 + (960,), # 249 + (960, 1, 3, 3), # 250 + (960,), # 251 + (960,), # 252 + (960,), # 253 + (960,), # 254 + (320, 960, 1, 1), # 255 + (320,), # 256 + (320,), # 257 + (320,), # 258 + (320,), # 259 + (1280, 320, 1, 1), # 260 + (1280,), # 261 + (1280,), # 262 + (1280,), # 263 + (1280,), # 264 + (1000, 1280, 1, 1), # 265 + ], + ) + + +def mobilenet(): + metatable = {"relay.Constant": mobilenet_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] { + %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]); + %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]); + %2 = %1.0; + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]); + %6 = %5.0; + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]); + %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]); + %10 = %9.0; + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]); + %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]); + %14 = %13.0; + %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %16 = nn.batch_norm(%15, meta[relay.Constant][21], 
meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]); + %17 = %16.0; + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, kernel_size=[3, 3]); + %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]); + %21 = %20.0; + %22 = nn.relu(%21); + %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]); + %25 = %24.0; + %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]); + %28 = %27.0; + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]); + %32 = %31.0; + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]); + %36 = %35.0; + %37 = add(%36, %25); + %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]); + %40 = %39.0; + %41 = nn.relu(%40); + %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], 
meta[relay.Constant][58], meta[relay.Constant][59]); + %44 = %43.0; + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %47 = nn.batch_norm(%46, meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]); + %48 = %47.0; + %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]); + %51 = %50.0; + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %55 = %54.0; + %56 = nn.relu(%55); + %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %59 = %58.0; + %60 = add(%59, %48); + %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %63 = %62.0; + %64 = nn.relu(%63); + %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %67 = %66.0; + %68 = nn.relu(%67); + %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %71 = %70.0; + %72 
= add(%71, %60); + %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], meta[relay.Constant][98], meta[relay.Constant][99]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %83 = %82.0; + %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %86 = %85.0; + %87 = nn.relu(%86); + %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %90 = %89.0; + %91 = nn.relu(%90); + %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %94 = %93.0; + %95 = add(%94, %83); + %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %98 = %97.0; + %99 = nn.relu(%98); + %100 = nn.conv2d(%99, 
meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], meta[relay.Constant][133], meta[relay.Constant][134]); + %102 = %101.0; + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]); + %106 = %105.0; + %107 = add(%106, %95); + %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]); + %110 = %109.0; + %111 = nn.relu(%110); + %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]); + %114 = %113.0; + %115 = nn.relu(%114); + %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]); + %118 = %117.0; + %119 = add(%118, %107); + %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]); + %122 = %121.0; + %123 = nn.relu(%122); + %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], 
meta[relay.Constant][164]); + %126 = %125.0; + %127 = nn.relu(%126); + %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %129 = nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]); + %130 = %129.0; + %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]); + %133 = %132.0; + %134 = nn.relu(%133); + %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]); + %137 = %136.0; + %138 = nn.relu(%137); + %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]); + %141 = %140.0; + %142 = add(%141, %130); + %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], 
meta[relay.Constant][198], meta[relay.Constant][199]); + %153 = %152.0; + %154 = add(%153, %142); + %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]); + %157 = %156.0; + %158 = nn.relu(%157); + %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]); + %161 = %160.0; + %162 = nn.relu(%161); + %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]); + %165 = %164.0; + %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]); + %168 = %167.0; + %169 = nn.relu(%168); + %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]); + %172 = %171.0; + %173 = nn.relu(%172); + %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]); + %176 = %175.0; + %177 = add(%176, %165); + %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %179 = nn.batch_norm(%178, 
meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][235], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]); + %188 = %187.0; + %189 = add(%188, %177); + %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]); + %192 = %191.0; + %193 = nn.relu(%192); + %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]); + %196 = %195.0; + %197 = nn.relu(%196); + %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]); + %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]); + %200 = %199.0; + %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]); + %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]); + %203 = %202.0; + %204 = nn.relu(%203); + %205 = nn.global_avg_pool2d(%204); + %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], 
channels=1000, kernel_size=[1, 1]); + reshape(%206, newshape=[0, -1]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mobilenet", + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def mobilenet_16(): + metatable = {"relay.Constant": mobilenet_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16] { + %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[1, 1, 1, 1], channels=32, kernel_size=[3, 3]); + %1 = nn.batch_norm(%0, meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4]); + %2 = %1.0; + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][5], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %5 = nn.batch_norm(%4, meta[relay.Constant][6], meta[relay.Constant][7], meta[relay.Constant][8], meta[relay.Constant][9]); + %6 = %5.0; + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][10], padding=[1, 1, 1, 1], groups=32, channels=32, kernel_size=[3, 3]); + %9 = nn.batch_norm(%8, meta[relay.Constant][11], meta[relay.Constant][12], meta[relay.Constant][13], meta[relay.Constant][14]); + %10 = %9.0; + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][15], padding=[0, 0, 0, 0], channels=16, kernel_size=[1, 1]); + %13 = nn.batch_norm(%12, meta[relay.Constant][16], meta[relay.Constant][17], meta[relay.Constant][18], meta[relay.Constant][19]); + %14 = %13.0; + %15 = nn.conv2d(%14, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %16 = nn.batch_norm(%15, meta[relay.Constant][21], meta[relay.Constant][22], meta[relay.Constant][23], meta[relay.Constant][24]); + %17 = %16.0; + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][25], strides=[2, 2], padding=[1, 1, 1, 1], groups=96, channels=96, 
kernel_size=[3, 3]); + %20 = nn.batch_norm(%19, meta[relay.Constant][26], meta[relay.Constant][27], meta[relay.Constant][28], meta[relay.Constant][29]); + %21 = %20.0; + %22 = nn.relu(%21); + %23 = nn.conv2d(%22, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %24 = nn.batch_norm(%23, meta[relay.Constant][31], meta[relay.Constant][32], meta[relay.Constant][33], meta[relay.Constant][34]); + %25 = %24.0; + %26 = nn.conv2d(%25, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %27 = nn.batch_norm(%26, meta[relay.Constant][36], meta[relay.Constant][37], meta[relay.Constant][38], meta[relay.Constant][39]); + %28 = %27.0; + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %31 = nn.batch_norm(%30, meta[relay.Constant][41], meta[relay.Constant][42], meta[relay.Constant][43], meta[relay.Constant][44]); + %32 = %31.0; + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][45], padding=[0, 0, 0, 0], channels=24, kernel_size=[1, 1]); + %35 = nn.batch_norm(%34, meta[relay.Constant][46], meta[relay.Constant][47], meta[relay.Constant][48], meta[relay.Constant][49]); + %36 = %35.0; + %37 = add(%36, %25); + %38 = nn.conv2d(%37, meta[relay.Constant][50], padding=[0, 0, 0, 0], channels=144, kernel_size=[1, 1]); + %39 = nn.batch_norm(%38, meta[relay.Constant][51], meta[relay.Constant][52], meta[relay.Constant][53], meta[relay.Constant][54]); + %40 = %39.0; + %41 = nn.relu(%40); + %42 = nn.conv2d(%41, meta[relay.Constant][55], strides=[2, 2], padding=[1, 1, 1, 1], groups=144, channels=144, kernel_size=[3, 3]); + %43 = nn.batch_norm(%42, meta[relay.Constant][56], meta[relay.Constant][57], meta[relay.Constant][58], meta[relay.Constant][59]); + %44 = %43.0; + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][60], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %47 = nn.batch_norm(%46, 
meta[relay.Constant][61], meta[relay.Constant][62], meta[relay.Constant][63], meta[relay.Constant][64]); + %48 = %47.0; + %49 = nn.conv2d(%48, meta[relay.Constant][65], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %50 = nn.batch_norm(%49, meta[relay.Constant][66], meta[relay.Constant][67], meta[relay.Constant][68], meta[relay.Constant][69]); + %51 = %50.0; + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][70], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %54 = nn.batch_norm(%53, meta[relay.Constant][71], meta[relay.Constant][72], meta[relay.Constant][73], meta[relay.Constant][74]); + %55 = %54.0; + %56 = nn.relu(%55); + %57 = nn.conv2d(%56, meta[relay.Constant][75], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %58 = nn.batch_norm(%57, meta[relay.Constant][76], meta[relay.Constant][77], meta[relay.Constant][78], meta[relay.Constant][79]); + %59 = %58.0; + %60 = add(%59, %48); + %61 = nn.conv2d(%60, meta[relay.Constant][80], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %62 = nn.batch_norm(%61, meta[relay.Constant][81], meta[relay.Constant][82], meta[relay.Constant][83], meta[relay.Constant][84]); + %63 = %62.0; + %64 = nn.relu(%63); + %65 = nn.conv2d(%64, meta[relay.Constant][85], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %66 = nn.batch_norm(%65, meta[relay.Constant][86], meta[relay.Constant][87], meta[relay.Constant][88], meta[relay.Constant][89]); + %67 = %66.0; + %68 = nn.relu(%67); + %69 = nn.conv2d(%68, meta[relay.Constant][90], padding=[0, 0, 0, 0], channels=32, kernel_size=[1, 1]); + %70 = nn.batch_norm(%69, meta[relay.Constant][91], meta[relay.Constant][92], meta[relay.Constant][93], meta[relay.Constant][94]); + %71 = %70.0; + %72 = add(%71, %60); + %73 = nn.conv2d(%72, meta[relay.Constant][95], padding=[0, 0, 0, 0], channels=192, kernel_size=[1, 1]); + %74 = nn.batch_norm(%73, meta[relay.Constant][96], meta[relay.Constant][97], 
meta[relay.Constant][98], meta[relay.Constant][99]); + %75 = %74.0; + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][100], padding=[1, 1, 1, 1], groups=192, channels=192, kernel_size=[3, 3]); + %78 = nn.batch_norm(%77, meta[relay.Constant][101], meta[relay.Constant][102], meta[relay.Constant][103], meta[relay.Constant][104]); + %79 = %78.0; + %80 = nn.relu(%79); + %81 = nn.conv2d(%80, meta[relay.Constant][105], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %82 = nn.batch_norm(%81, meta[relay.Constant][106], meta[relay.Constant][107], meta[relay.Constant][108], meta[relay.Constant][109]); + %83 = %82.0; + %84 = nn.conv2d(%83, meta[relay.Constant][110], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %85 = nn.batch_norm(%84, meta[relay.Constant][111], meta[relay.Constant][112], meta[relay.Constant][113], meta[relay.Constant][114]); + %86 = %85.0; + %87 = nn.relu(%86); + %88 = nn.conv2d(%87, meta[relay.Constant][115], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %89 = nn.batch_norm(%88, meta[relay.Constant][116], meta[relay.Constant][117], meta[relay.Constant][118], meta[relay.Constant][119]); + %90 = %89.0; + %91 = nn.relu(%90); + %92 = nn.conv2d(%91, meta[relay.Constant][120], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %93 = nn.batch_norm(%92, meta[relay.Constant][121], meta[relay.Constant][122], meta[relay.Constant][123], meta[relay.Constant][124]); + %94 = %93.0; + %95 = add(%94, %83); + %96 = nn.conv2d(%95, meta[relay.Constant][125], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %97 = nn.batch_norm(%96, meta[relay.Constant][126], meta[relay.Constant][127], meta[relay.Constant][128], meta[relay.Constant][129]); + %98 = %97.0; + %99 = nn.relu(%98); + %100 = nn.conv2d(%99, meta[relay.Constant][130], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %101 = nn.batch_norm(%100, meta[relay.Constant][131], meta[relay.Constant][132], 
meta[relay.Constant][133], meta[relay.Constant][134]); + %102 = %101.0; + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][135], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %105 = nn.batch_norm(%104, meta[relay.Constant][136], meta[relay.Constant][137], meta[relay.Constant][138], meta[relay.Constant][139]); + %106 = %105.0; + %107 = add(%106, %95); + %108 = nn.conv2d(%107, meta[relay.Constant][140], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %109 = nn.batch_norm(%108, meta[relay.Constant][141], meta[relay.Constant][142], meta[relay.Constant][143], meta[relay.Constant][144]); + %110 = %109.0; + %111 = nn.relu(%110); + %112 = nn.conv2d(%111, meta[relay.Constant][145], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %113 = nn.batch_norm(%112, meta[relay.Constant][146], meta[relay.Constant][147], meta[relay.Constant][148], meta[relay.Constant][149]); + %114 = %113.0; + %115 = nn.relu(%114); + %116 = nn.conv2d(%115, meta[relay.Constant][150], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]); + %117 = nn.batch_norm(%116, meta[relay.Constant][151], meta[relay.Constant][152], meta[relay.Constant][153], meta[relay.Constant][154]); + %118 = %117.0; + %119 = add(%118, %107); + %120 = nn.conv2d(%119, meta[relay.Constant][155], padding=[0, 0, 0, 0], channels=384, kernel_size=[1, 1]); + %121 = nn.batch_norm(%120, meta[relay.Constant][156], meta[relay.Constant][157], meta[relay.Constant][158], meta[relay.Constant][159]); + %122 = %121.0; + %123 = nn.relu(%122); + %124 = nn.conv2d(%123, meta[relay.Constant][160], strides=[2, 2], padding=[1, 1, 1, 1], groups=384, channels=384, kernel_size=[3, 3]); + %125 = nn.batch_norm(%124, meta[relay.Constant][161], meta[relay.Constant][162], meta[relay.Constant][163], meta[relay.Constant][164]); + %126 = %125.0; + %127 = nn.relu(%126); + %128 = nn.conv2d(%127, meta[relay.Constant][165], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %129 = 
nn.batch_norm(%128, meta[relay.Constant][166], meta[relay.Constant][167], meta[relay.Constant][168], meta[relay.Constant][169]); + %130 = %129.0; + %131 = nn.conv2d(%130, meta[relay.Constant][170], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %132 = nn.batch_norm(%131, meta[relay.Constant][171], meta[relay.Constant][172], meta[relay.Constant][173], meta[relay.Constant][174]); + %133 = %132.0; + %134 = nn.relu(%133); + %135 = nn.conv2d(%134, meta[relay.Constant][175], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %136 = nn.batch_norm(%135, meta[relay.Constant][176], meta[relay.Constant][177], meta[relay.Constant][178], meta[relay.Constant][179]); + %137 = %136.0; + %138 = nn.relu(%137); + %139 = nn.conv2d(%138, meta[relay.Constant][180], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %140 = nn.batch_norm(%139, meta[relay.Constant][181], meta[relay.Constant][182], meta[relay.Constant][183], meta[relay.Constant][184]); + %141 = %140.0; + %142 = add(%141, %130); + %143 = nn.conv2d(%142, meta[relay.Constant][185], padding=[0, 0, 0, 0], channels=576, kernel_size=[1, 1]); + %144 = nn.batch_norm(%143, meta[relay.Constant][186], meta[relay.Constant][187], meta[relay.Constant][188], meta[relay.Constant][189]); + %145 = %144.0; + %146 = nn.relu(%145); + %147 = nn.conv2d(%146, meta[relay.Constant][190], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %148 = nn.batch_norm(%147, meta[relay.Constant][191], meta[relay.Constant][192], meta[relay.Constant][193], meta[relay.Constant][194]); + %149 = %148.0; + %150 = nn.relu(%149); + %151 = nn.conv2d(%150, meta[relay.Constant][195], padding=[0, 0, 0, 0], channels=96, kernel_size=[1, 1]); + %152 = nn.batch_norm(%151, meta[relay.Constant][196], meta[relay.Constant][197], meta[relay.Constant][198], meta[relay.Constant][199]); + %153 = %152.0; + %154 = add(%153, %142); + %155 = nn.conv2d(%154, meta[relay.Constant][200], padding=[0, 0, 0, 0], channels=576, 
kernel_size=[1, 1]); + %156 = nn.batch_norm(%155, meta[relay.Constant][201], meta[relay.Constant][202], meta[relay.Constant][203], meta[relay.Constant][204]); + %157 = %156.0; + %158 = nn.relu(%157); + %159 = nn.conv2d(%158, meta[relay.Constant][205], strides=[2, 2], padding=[1, 1, 1, 1], groups=576, channels=576, kernel_size=[3, 3]); + %160 = nn.batch_norm(%159, meta[relay.Constant][206], meta[relay.Constant][207], meta[relay.Constant][208], meta[relay.Constant][209]); + %161 = %160.0; + %162 = nn.relu(%161); + %163 = nn.conv2d(%162, meta[relay.Constant][210], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %164 = nn.batch_norm(%163, meta[relay.Constant][211], meta[relay.Constant][212], meta[relay.Constant][213], meta[relay.Constant][214]); + %165 = %164.0; + %166 = nn.conv2d(%165, meta[relay.Constant][215], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %167 = nn.batch_norm(%166, meta[relay.Constant][216], meta[relay.Constant][217], meta[relay.Constant][218], meta[relay.Constant][219]); + %168 = %167.0; + %169 = nn.relu(%168); + %170 = nn.conv2d(%169, meta[relay.Constant][220], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %171 = nn.batch_norm(%170, meta[relay.Constant][221], meta[relay.Constant][222], meta[relay.Constant][223], meta[relay.Constant][224]); + %172 = %171.0; + %173 = nn.relu(%172); + %174 = nn.conv2d(%173, meta[relay.Constant][225], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %175 = nn.batch_norm(%174, meta[relay.Constant][226], meta[relay.Constant][227], meta[relay.Constant][228], meta[relay.Constant][229]); + %176 = %175.0; + %177 = add(%176, %165); + %178 = nn.conv2d(%177, meta[relay.Constant][230], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %179 = nn.batch_norm(%178, meta[relay.Constant][231], meta[relay.Constant][232], meta[relay.Constant][233], meta[relay.Constant][234]); + %180 = %179.0; + %181 = nn.relu(%180); + %182 = nn.conv2d(%181, meta[relay.Constant][235], 
padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %183 = nn.batch_norm(%182, meta[relay.Constant][236], meta[relay.Constant][237], meta[relay.Constant][238], meta[relay.Constant][239]); + %184 = %183.0; + %185 = nn.relu(%184); + %186 = nn.conv2d(%185, meta[relay.Constant][240], padding=[0, 0, 0, 0], channels=160, kernel_size=[1, 1]); + %187 = nn.batch_norm(%186, meta[relay.Constant][241], meta[relay.Constant][242], meta[relay.Constant][243], meta[relay.Constant][244]); + %188 = %187.0; + %189 = add(%188, %177); + %190 = nn.conv2d(%189, meta[relay.Constant][245], padding=[0, 0, 0, 0], channels=960, kernel_size=[1, 1]); + %191 = nn.batch_norm(%190, meta[relay.Constant][246], meta[relay.Constant][247], meta[relay.Constant][248], meta[relay.Constant][249]); + %192 = %191.0; + %193 = nn.relu(%192); + %194 = nn.conv2d(%193, meta[relay.Constant][250], padding=[1, 1, 1, 1], groups=960, channels=960, kernel_size=[3, 3]); + %195 = nn.batch_norm(%194, meta[relay.Constant][251], meta[relay.Constant][252], meta[relay.Constant][253], meta[relay.Constant][254]); + %196 = %195.0; + %197 = nn.relu(%196); + %198 = nn.conv2d(%197, meta[relay.Constant][255], padding=[0, 0, 0, 0], channels=320, kernel_size=[1, 1]); + %199 = nn.batch_norm(%198, meta[relay.Constant][256], meta[relay.Constant][257], meta[relay.Constant][258], meta[relay.Constant][259]); + %200 = %199.0; + %201 = nn.conv2d(%200, meta[relay.Constant][260], padding=[0, 0, 0, 0], channels=1280, kernel_size=[1, 1]); + %202 = nn.batch_norm(%201, meta[relay.Constant][261], meta[relay.Constant][262], meta[relay.Constant][263], meta[relay.Constant][264]); + %203 = %202.0; + %204 = nn.relu(%203); + %205 = nn.global_avg_pool2d(%204); + %206 = nn.conv2d(%205, meta[relay.Constant][265], padding=[0, 0, 0, 0], channels=1000, kernel_size=[1, 1]); + reshape(%206, newshape=[0, -1]) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "mobilenet_16", + "input_shapes": {"data": [1, 3, 224, 224]}, + 
"input_dtypes": {"data": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def batch_norm_extract(): + consts = make_consts( + "float32", + [ + (32,), # 0 + (32,), # 1 + (32,), # 2 + (32,), # 3 + ], + ) + metatable = {"relay.Constant": consts} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%FunctionVar_0: Tensor[(1, 32, 112, 112), float32]) -> Tensor[(1, 32, 112, 112), float32] { + %3 = nn.batch_norm(%FunctionVar_0, meta[relay.Constant][0], meta[relay.Constant][1], meta[relay.Constant][2], meta[relay.Constant][3]); + %3.0 + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "batch_norm_extract", + "input_shapes": {"FunctionVar_0": [1, 32, 112, 112]}, + "input_dtypes": {"FunctionVar_0": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def resnext50_32x4d_consts(dtype): + return make_consts( + dtype, + [ + (128, 64, 1, 1), # 0 + (128, 4, 3, 3), # 1 + (256, 128, 1, 1), # 2 + (256, 64, 1, 1), # 3 + (128, 256, 1, 1), # 4 + (128, 4, 3, 3), # 5 + (256, 128, 1, 1), # 6 + (128, 256, 1, 1), # 7 + (128, 4, 3, 3), # 8 + (256, 128, 1, 1), # 9 + (256, 256, 1, 1), # 10 + (256, 8, 3, 3), # 11 + (512, 256, 1, 1), # 12 + (512, 256, 1, 1), # 13 + (256, 512, 1, 1), # 14 + (256, 8, 3, 3), # 15 + (512, 256, 1, 1), # 16 + (256, 512, 1, 1), # 17 + (256, 8, 3, 3), # 18 + (512, 256, 1, 1), # 19 + (256, 512, 1, 1), # 20 + (256, 8, 3, 3), # 21 + (512, 256, 1, 1), # 22 + (512, 512, 1, 1), # 23 + (512, 16, 3, 3), # 24 + (1024, 512, 1, 1), # 25 + (1024, 512, 1, 1), # 26 + (512, 1024, 1, 1), # 27 + (512, 16, 3, 3), # 28 + (1024, 512, 1, 1), # 29 + (512, 1024, 1, 1), # 30 + (512, 16, 3, 3), # 31 + (1024, 512, 1, 1), # 32 + (512, 1024, 1, 1), # 33 + (512, 16, 3, 3), # 34 + (1024, 512, 1, 1), # 35 + (512, 1024, 1, 1), # 36 + (512, 16, 3, 3), # 37 + (1024, 512, 1, 1), # 38 + (512, 1024, 1, 1), # 39 + (512, 16, 3, 3), # 40 + (1024, 512, 1, 1), # 41 + (1024, 1024, 1, 1), # 42 + (1024, 32, 3, 3), # 43 + 
(2048, 1024, 1, 1), # 44 + (2048, 1024, 1, 1), # 45 + (1024, 2048, 1, 1), # 46 + (1024, 32, 3, 3), # 47 + (2048, 1024, 1, 1), # 48 + (1024, 2048, 1, 1), # 49 + (1024, 32, 3, 3), # 50 + (2048, 1024, 1, 1), # 51 + ], + ) + + +def resnext50_32x4d(): + metatable = {"relay.Constant": resnext50_32x4d_consts("float32")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 64, 56, 56), float32]) { + %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %1 = nn.relu(%0); + %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %6 = add(%4, %5); + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %13 = add(%12, %7); + %14 = nn.relu(%13); + %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %16 = nn.relu(%15); + %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%19, %14); + %21 = nn.relu(%20); + %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, 
kernel_size=[3, 3]); + %25 = nn.relu(%24); + %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %28 = add(%26, %27); + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %35 = add(%34, %29); + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %38 = nn.relu(%37); + %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %42 = add(%41, %36); + %43 = nn.relu(%42); + %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %47 = nn.relu(%46); + %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %49 = add(%48, %43); + %50 = nn.relu(%49); + %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %54 = nn.relu(%53); + %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %56 = nn.conv2d(%50, 
meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %57 = add(%55, %56); + %58 = nn.relu(%57); + %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = nn.relu(%59); + %61 = nn.conv2d(%60, meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %62 = nn.relu(%61); + %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %64 = add(%63, %58); + %65 = nn.relu(%64); + %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %69 = nn.relu(%68); + %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %71 = add(%70, %65); + %72 = nn.relu(%71); + %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %74 = nn.relu(%73); + %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %78 = add(%77, %72); + %79 = nn.relu(%78); + %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %81 = nn.relu(%80); + %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %83 = nn.relu(%82); + %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %85 = add(%84, %79); + %86 = nn.relu(%85); + %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %88 = nn.relu(%87); + %89 = nn.conv2d(%88, meta[relay.Constant][40], 
padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %90 = nn.relu(%89); + %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %92 = add(%91, %86); + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][42], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %95 = nn.relu(%94); + %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %100 = add(%98, %99); + %101 = nn.relu(%100); + %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %105 = nn.relu(%104); + %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %107 = add(%106, %101); + %108 = nn.relu(%107); + %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %112 = nn.relu(%111); + %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %114 = add(%113, %108); + nn.relu(%114) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnext50_32x4d", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float32"}, + "mod": mod, + "params": None, + "main_dtype": "float32", + } + + +def resnext50_32x4d_16(): + metatable = {"relay.Constant": 
resnext50_32x4d_consts("float16")} + mod = tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(1, 64, 56, 56), float16]) { + %0 = nn.conv2d(%x, meta[relay.Constant][0], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %1 = nn.relu(%0); + %2 = nn.conv2d(%1, meta[relay.Constant][1], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %3 = nn.relu(%2); + %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %5 = nn.conv2d(%x, meta[relay.Constant][3], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %6 = add(%4, %5); + %7 = nn.relu(%6); + %8 = nn.conv2d(%7, meta[relay.Constant][4], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %9 = nn.relu(%8); + %10 = nn.conv2d(%9, meta[relay.Constant][5], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %11 = nn.relu(%10); + %12 = nn.conv2d(%11, meta[relay.Constant][6], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %13 = add(%12, %7); + %14 = nn.relu(%13); + %15 = nn.conv2d(%14, meta[relay.Constant][7], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1]); + %16 = nn.relu(%15); + %17 = nn.conv2d(%16, meta[relay.Constant][8], padding=[1, 1, 1, 1], groups=32, channels=128, kernel_size=[3, 3]); + %18 = nn.relu(%17); + %19 = nn.conv2d(%18, meta[relay.Constant][9], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %20 = add(%19, %14); + %21 = nn.relu(%20); + %22 = nn.conv2d(%21, meta[relay.Constant][10], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %23 = nn.relu(%22); + %24 = nn.conv2d(%23, meta[relay.Constant][11], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %25 = nn.relu(%24); + %26 = nn.conv2d(%25, meta[relay.Constant][12], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %27 = nn.conv2d(%21, meta[relay.Constant][13], strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %28 = add(%26, 
%27); + %29 = nn.relu(%28); + %30 = nn.conv2d(%29, meta[relay.Constant][14], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %31 = nn.relu(%30); + %32 = nn.conv2d(%31, meta[relay.Constant][15], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %33 = nn.relu(%32); + %34 = nn.conv2d(%33, meta[relay.Constant][16], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %35 = add(%34, %29); + %36 = nn.relu(%35); + %37 = nn.conv2d(%36, meta[relay.Constant][17], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %38 = nn.relu(%37); + %39 = nn.conv2d(%38, meta[relay.Constant][18], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %40 = nn.relu(%39); + %41 = nn.conv2d(%40, meta[relay.Constant][19], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %42 = add(%41, %36); + %43 = nn.relu(%42); + %44 = nn.conv2d(%43, meta[relay.Constant][20], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %45 = nn.relu(%44); + %46 = nn.conv2d(%45, meta[relay.Constant][21], padding=[1, 1, 1, 1], groups=32, channels=256, kernel_size=[3, 3]); + %47 = nn.relu(%46); + %48 = nn.conv2d(%47, meta[relay.Constant][22], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %49 = add(%48, %43); + %50 = nn.relu(%49); + %51 = nn.conv2d(%50, meta[relay.Constant][23], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %52 = nn.relu(%51); + %53 = nn.conv2d(%52, meta[relay.Constant][24], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %54 = nn.relu(%53); + %55 = nn.conv2d(%54, meta[relay.Constant][25], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %56 = nn.conv2d(%50, meta[relay.Constant][26], strides=[2, 2], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %57 = add(%55, %56); + %58 = nn.relu(%57); + %59 = nn.conv2d(%58, meta[relay.Constant][27], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %60 = nn.relu(%59); + %61 = nn.conv2d(%60, 
meta[relay.Constant][28], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %62 = nn.relu(%61); + %63 = nn.conv2d(%62, meta[relay.Constant][29], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %64 = add(%63, %58); + %65 = nn.relu(%64); + %66 = nn.conv2d(%65, meta[relay.Constant][30], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %67 = nn.relu(%66); + %68 = nn.conv2d(%67, meta[relay.Constant][31], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %69 = nn.relu(%68); + %70 = nn.conv2d(%69, meta[relay.Constant][32], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %71 = add(%70, %65); + %72 = nn.relu(%71); + %73 = nn.conv2d(%72, meta[relay.Constant][33], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %74 = nn.relu(%73); + %75 = nn.conv2d(%74, meta[relay.Constant][34], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %76 = nn.relu(%75); + %77 = nn.conv2d(%76, meta[relay.Constant][35], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %78 = add(%77, %72); + %79 = nn.relu(%78); + %80 = nn.conv2d(%79, meta[relay.Constant][36], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %81 = nn.relu(%80); + %82 = nn.conv2d(%81, meta[relay.Constant][37], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %83 = nn.relu(%82); + %84 = nn.conv2d(%83, meta[relay.Constant][38], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %85 = add(%84, %79); + %86 = nn.relu(%85); + %87 = nn.conv2d(%86, meta[relay.Constant][39], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1]); + %88 = nn.relu(%87); + %89 = nn.conv2d(%88, meta[relay.Constant][40], padding=[1, 1, 1, 1], groups=32, channels=512, kernel_size=[3, 3]); + %90 = nn.relu(%89); + %91 = nn.conv2d(%90, meta[relay.Constant][41], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %92 = add(%91, %86); + %93 = nn.relu(%92); + %94 = nn.conv2d(%93, meta[relay.Constant][42], 
padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %95 = nn.relu(%94); + %96 = nn.conv2d(%95, meta[relay.Constant][43], strides=[2, 2], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %97 = nn.relu(%96); + %98 = nn.conv2d(%97, meta[relay.Constant][44], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %99 = nn.conv2d(%93, meta[relay.Constant][45], strides=[2, 2], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %100 = add(%98, %99); + %101 = nn.relu(%100); + %102 = nn.conv2d(%101, meta[relay.Constant][46], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %103 = nn.relu(%102); + %104 = nn.conv2d(%103, meta[relay.Constant][47], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %105 = nn.relu(%104); + %106 = nn.conv2d(%105, meta[relay.Constant][48], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %107 = add(%106, %101); + %108 = nn.relu(%107); + %109 = nn.conv2d(%108, meta[relay.Constant][49], padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1]); + %110 = nn.relu(%109); + %111 = nn.conv2d(%110, meta[relay.Constant][50], padding=[1, 1, 1, 1], groups=32, channels=1024, kernel_size=[3, 3]); + %112 = nn.relu(%111); + %113 = nn.conv2d(%112, meta[relay.Constant][51], padding=[0, 0, 0, 0], channels=2048, kernel_size=[1, 1]); + %114 = add(%113, %108); + nn.relu(%114) + } + """, + "from_string", + None, + metatable, + ) + return { + "name": "resnext50_32x4d_16", + "input_shapes": {"x": [1, 64, 56, 56]}, + "input_dtypes": {"x": "float16"}, + "mod": mod, + "params": None, + "main_dtype": "float16", + } + + +def describe_onnx(name, filename): + """Returns the description of the ONNX model at filename, which can be passed to from_onnx to actually load + the model. Note that ? 
(ie unknown) shape dimensions must be manually changed to concrete dimensions + which are consistent with the overall model.""" + onnx_model = onnx.load(MODEL_PREFIX + filename) + input_shapes = {} + input_dtypes = {} + initializer_names = [n.name for n in onnx_model.graph.initializer] + for input_info in onnx_model.graph.input: + if input_info.name not in initializer_names: + _, shape, dtype, _ = tvm.relay.frontend.onnx.get_info(input_info) + if dtype is None: + raise ValueError(f"Unknown dtype on input '{input_info.name}' is not supported.") + input_shapes.update({input_info.name: shape}) + input_dtypes.update({input_info.name: dtype}) + print( + f"{{'name': '{name}', 'filename': '{filename}', 'input_shapes': {input_shapes}, 'input_dtypes': {input_dtypes}, 'main_dtype': 'float32'}}" + ) + + +def from_onnx(model): + logging.info("-------------------- BEGIN ONNX IMPORT --------------------") + + filename = MODEL_PREFIX + model["filename"] + logging.info(f"Loading ONNX model from {filename}") + + onnx_model = onnx.load(filename) + logging.info(f"Loaded model from {filename}") + + mod, params = tvm.relay.frontend.from_onnx( + onnx_model, model["input_shapes"], freeze_params=True + ) + mod = tvm.relay.transform.InferType()(mod) + logging.info("-------------------- END ONNX IMPORT --------------------") + + logging.info(f"Imported model:\n{mod}") + logging.info(f"Params:\n{params}") + + return { + "name": model["name"], + "input_shapes": model["input_shapes"], + "input_dtypes": model["input_dtypes"], + "mod": mod, + "params": params, + "main_dtype": model["main_dtype"], + } + + +def to_onnx(model): + logging.info("-------------------- BEGIN ONNX EXPORT --------------------") + short_filename = model["name"] + ".onnx" + filename = MODEL_PREFIX + short_filename + logging.info(f"Saving ONNX model to {filename}") + + params = model["params"] + if params is None: + params = {} + tvm.contrib.target.onnx.to_onnx(model["mod"], params, model["name"], path=filename) + 
logging.info("-------------------- END ONNX EXPORT --------------------") + + return { + "name": model["name"], + "filename": short_filename, + "input_shapes": model["input_shapes"], + "input_dtypes": model["input_dtypes"], + "main_dtype": model["main_dtype"], + } diff --git a/tests/python/relay/collage/test_collage_partitioner.py b/tests/python/relay/collage/test_collage_partitioner.py new file mode 100644 index 0000000000000..e1217fde44192 --- /dev/null +++ b/tests/python/relay/collage/test_collage_partitioner.py @@ -0,0 +1,269 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import logging +import tempfile +import os +import menangerie + +# The following are necessary to force global functions or pattern tables to be registered +from tvm.relay.op.contrib.cutlass import partition_for_cutlass +from tvm.contrib.cutlass import num_cutlass_partitions +from tvm.relay.op.contrib.cublas import partition_for_cublas +from tvm.relay.op.contrib.cudnn import partition_for_cudnn + +logging.basicConfig(level=logging.INFO) + +HOST = tvm.target.Target("llvm") +CUDA = tvm.target.Target("cuda", HOST) + +### +### Rename to match your hardware, eg ..._vt100... 
+### +TUNING_LOG = "collage_autotvm_rtx3070.tuninglog" + +### +### If true, runs final model under nvprof +### +PROFILE = True + +### +### If true, run all models +### +ALL_MODELS = False + +### +### If true, run all configurations +### +ALL_CONFIGS = False + +TVM_MAX_MAX_DEPTH = 8 +BYOC_MAX_MAX_DEPTH = 8 + +runner_template = """ +import tvm +import tvm.runtime.vm +import numpy as np +import logging + +logging.basicConfig(level=logging.INFO) + +MEASURE_NUMBER = 20 +MEASURE_REPEAT = 5 +WARMUP_MIN_REPEAT_MS = 250 + +def arg_for(shape, dtype, device): + return tvm.nd.array( + np.random.rand(*shape).astype(dtype), device=device) + +def vm_estimate_seconds(device, vm, args): + vm.benchmark(device, repeat=1, number=1, min_repeat_ms=WARMUP_MIN_REPEAT_MS, **args) + return vm.benchmark(device, repeat=MEASURE_REPEAT, number=MEASURE_NUMBER, min_repeat_ms=0, + **args) + + +def run(label, name, device, lib_path, code_path, input_shapes, input_dtypes): + logging.info(f"Loading compiled code for {name} generated by {label} from {lib_path} and {code_path}...") + loaded_lib = tvm.runtime.load_module(lib_path) + loaded_code = bytearray(open(code_path, "rb").read()) + loaded_exe = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib) + vm = tvm.runtime.vm.VirtualMachine(loaded_exe, device) + args = { + input_name: arg_for(input_shapes[input_name], input_dtypes[input_name], device) + for input_name in input_shapes.keys() + } + logging.info(f"Benchmarking for {name} generated by {label}...") + profile = vm_estimate_seconds(device, vm, args) + logging.info(f"Benchmarked for {name} generated by {label}: {profile}") + logging.info(f"RESULT: {label} | {name} | {profile.median * 1e3}ms") + +if __name__ == "__main__": +""" + + +def compile_and_benchmark(label, model, targets, dev, tmp_dir): + logging.info(f"Compiling {model['name']} using {label} with {targets}...") + exe = tvm.relay.vm.compile(model["mod"], target=targets, params=model["params"]) + lib_path = os.path.join(tmp_dir, 
"lib.so") + code_path = os.path.join(tmp_dir, "code.ro") + code, lib = exe.save() + logging.info(f"Saving VM code to {code_path}...") + with open(code_path, "wb") as fo: + fo.write(code) + logging.info(f"Exporting library to {lib_path}...") + lib.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc") + runner = f"{runner_template} run('{label}', '{model['name']}', tvm.device({dev.device_type}), '{lib_path}', '{code_path}', {model['input_shapes']}, {model['input_dtypes']})\n" + runner_path = os.path.join(tmp_dir, "runner.py") + logging.info(f"Saving runner to {runner_path}...") + with open(runner_path, "w") as fo: + fo.write(runner) + + logging.info(f"Invoking runner...") + if PROFILE: + profile_path = os.path.join(tmp_dir, "profile.txt") + os.system(f"nsys nvprof -o {profile_path} python3 {runner_path}") + else: + os.system(f"python3 {runner_path}") + + +def collage(model): + logging.info(f"collage | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + targets = [] + targets.append(CUDA) + use_fp16 = model["main_dtype"] == "float16" + targets.append( + tvm.target.Target(f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST) + ) + tmp_dir = tempfile.mkdtemp() + targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST)) + targets.append(tvm.target.Target("cublas", HOST)) + targets.append(tvm.target.Target("cudnn", HOST)) + config = { + "relay.collage.tvm_max_max_depth": TVM_MAX_MAX_DEPTH, + "relay.collage.byoc_max_max_depth": BYOC_MAX_MAX_DEPTH, + } + logging.info(f"Using PassContext(config={config}") + ctxt = tvm.transform.PassContext(config=config) + config = tvm.target.make_compilation_config(ctxt, targets) + with ctxt: + mod = model["mod"] + mod = 
tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + logging.info("-------------- BEGIN INDEXED --------------") + logging.info(mod) + logging.info("-------------- END INDEXED ----------------") + mod = tvm.relay.transform.CollagePartition(config)(mod) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("collage", partitioned_model, targets, dev, tmp_dir) + + +def just_tensorrt(model): + logging.info(f"just_tensorrt | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + logging.info("Partitioning for TensorRT...") + use_fp16 = model["main_dtype"] == "float16" + trt_target = tvm.target.Target( + f"tensorrt -use_implicit_batch=False -use_fp16={use_fp16}", HOST + ) + mod = tvm.relay.op.contrib.partition_for_tensorrt( + mod=model["mod"], params=model["params"], target=trt_target + ) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + targets = [] + targets.append(CUDA) + targets.append(trt_target) + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_tensorrt", partitioned_model, targets, dev, tmp_dir) + + +def just_cutlass(model): + logging.info(f"just_cutlass | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL 
----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + logging.info("Partitioning for CUTLASS...") + mod = tvm.relay.op.contrib.partition_for_cutlass(model["mod"], model["params"]) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + targets = [] + targets.append(CUDA) + targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST)) + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_cutlass", partitioned_model, targets, dev, tmp_dir) + + +def just_tvm(model): + logging.info(f"just_tvm | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + tvm.relay.collage.autotvm_tune_module(model["mod"], CUDA, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + dev = tvm.device(CUDA.kind.device_type) + compile_and_benchmark("just_tvm", model, CUDA, dev, tmp_dir) + + +def tvm_with_libs(model): + logging.info(f"tvm_with_libs | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + cuda_target = tvm.target.Target("cuda -libs=cudnn,cublas", HOST) + tvm.relay.collage.autotvm_tune_module(model["mod"], cuda_target, TUNING_LOG) + with tvm.relay.collage.optional_tuning_records(TUNING_LOG): + dev = tvm.device(cuda_target.kind.device_type) + compile_and_benchmark("tvm_with_libs", model, cuda_target, dev, tmp_dir) + + +def test_all(): + make_models 
= [] + make_models.append(menangerie.resnext50_32x4d) + if ALL_MODELS: + make_models.append(menangerie.resnext50_32x4d_16) + make_models.append(menangerie.gpt2_16) + make_models.append(menangerie.gpt2) + make_models.append(menangerie.mobilenet_16) + make_models.append(menangerie.mobilenet) + make_models.append(menangerie.resnet50_16) + make_models.append(menangerie.resnet50) + run_models = [] + if ALL_CONFIGS: + run_models.append(just_tensorrt) + run_models.append(just_tvm) + run_models.append(tvm_with_libs) + run_models.append(collage) + for make_model in make_models: + model = make_model() + for run_model in run_models: + run_model(model) + + +def test_mini(): + collage(menangerie.gpt2_16_for_cutlass_extract()) + + +if __name__ == "__main__": + # test_all() + test_mini() diff --git a/tests/python/relay/collage/test_sub_graph.py b/tests/python/relay/collage/test_sub_graph.py new file mode 100644 index 0000000000000..a231bd72454fa --- /dev/null +++ b/tests/python/relay/collage/test_sub_graph.py @@ -0,0 +1,400 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import logging +import tvm.testing + +logging.basicConfig(level=logging.INFO) + +partition_on_indexes_for_testing = tvm._ffi.get_global_func( + "relay.collage.partition_on_indexes_for_testing" +) + + +def print_with_indexes(mod): + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + print(mod) + + +def process(mod, max_outputs, allow_taps, indexes, labels=None): + mod = tvm.relay.transform.InferType()(mod) + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + mod = partition_on_indexes_for_testing(max_outputs, allow_taps, indexes, labels)(mod) + return mod + + +def assert_eq(in_mod, expected_mod, actual_mod): + in_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(in_mod) + expected_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(expected_mod) + if not tvm.ir.structural_equal(actual_mod, expected_mod, True): + # Print everything in full so we can see what's going on when things fail. + print("Input module:") + print(in_mod) + print("Expected module:") + print(expected_mod) + print("Actual module:") + print(actual_mod) + # Assert again so as to see the actual disagreeing sub-expressions. 
+ tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True) + + +def test_single_op(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = (fn(%x, %y, Composite="a") { add(%x, %y) })(%c, %d); + subtract(%0, %1) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 1, False, [7], ["a"])) + + +def test_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 6 + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = (fn(%w, %x, %y, %z, Composite="a") { (add(%y, %z), add(%w, %x)) })(%c, %d, %a, %b); + %1 = %0.0; + %2 = %0.1; + subtract(%1, %2) + } + """ + ) + + # No rewrite since 2 outputs + assert_eq(input(), input(), process(input(), 1, False, [6, 7], ["a", "a"])) + # Rewrite + assert_eq(input(), expected(), process(input(), 2, False, [6, 7], ["a", "a"])) + + +def test_classic_conv2d_add_relu(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %0 = nn.conv2d(%a, %b); // node 
8 + %1 = add(%0, %c); // node 9 + %2 = nn.relu(%1); // node 10 + subtract(%2, %d) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %2 = (fn(%x, %y, %z, Composite="a") { + %0 = nn.conv2d(%x, %y); + %1 = add(%0, %z); + nn.relu(%1) + })(%a, %b, %c); + subtract(%2, %d) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 1, False, [8, 9, 10], ["a", "a", "a"])) + + +def test_diamond_single_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) // node 10 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + add(%2, %3) + })(%a, %b) + } + """ + ) + + assert_eq( + input(), expected(), process(input(), 1, False, [5, 6, 7, 9, 10], ["a", "a", "a", "a", "a"]) + ) + + +def test_diamond_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def 
@main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %4 = (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + (%2, %3) + })(%a, %b); + %5 = %4.0; + %6 = %4.1; + add(%5, %6) + } + """ + ) + + assert_eq(input(), expected(), process(input(), 2, False, [5, 6, 7, 9], ["a", "a", "a", "a"])) + + +def test_with_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + add(%1, %0) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %2 = (fn (%x, %y, Composite="a") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + (%0, %1) + })(%a, %b); + %3 = %2.1; + %4 = %2.0; + add(%3, %4) + } + """ + ) + + # No rewrite since has tap + assert_eq(input(), input(), process(input(), 2, False, [5, 6], ["a", "a"])) + # Rewrite + assert_eq(input(), expected(), process(input(), 2, True, [5, 6], ["a", "a"])) + + +def test_no_cycles(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 3 + %1 = add(%0, %b); + add(%1, %b) // node 5 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + (fn(%x, %y, Composite="a") { + %0 = add(%x, %y); + %1 = add(%0, %y); + add(%1, %y) + })(%a, %b) + } + """ + ) + + # No rewrite since would create cycle + assert_eq(input(), input(), process(input(), 2, False, [3, 5], ["a", "a'"])) + # No cycle + 
assert_eq(input(), expected(), process(input(), 2, False, [3, 4, 5], ["a", "a", "a"])) + + +def test_labels_direct_connection(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + nn.relu(%7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); + %4 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + add(%2, %3) + })(%0); + %7 = (fn(%z, Composite="b") { + %5 = nn.relu(%z); + %6 = nn.relu(%z); + add(%5, %6) + })(%4); + nn.relu(%7) + } + """ + ) + + assert_eq( + input(), + expected(), + process( + input(), + 1, + False, + [3, 4, 5, 6, 7, 8, 9, 10, 11], + ["", "a", "a", "a", "a", "b", "b", "b", ""], + ), + ) + + +def test_labels_nested_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + add(%2, %7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); + %5 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + %4 = add(%2, %3); + (%2, %4) + })(%0); + %8 = (fn(%z, Composite="b") { + %6 = nn.relu(%z); + %7 = nn.relu(%z); + add(%6, %7) + })(%5.1); + add(%5.0, %8) + } + """ + ) + + assert_eq( + input(), + expected(), + process(input(), 2, True, [4, 5, 6, 7, 8, 9, 10], 
["a", "a", "a", "a", "b", "b", "b"]), + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relay/test_pass_collage_partition.py b/tests/python/relay/test_pass_collage_partition.py new file mode 100644 index 0000000000000..cd0a915c5fea6 --- /dev/null +++ b/tests/python/relay/test_pass_collage_partition.py @@ -0,0 +1,567 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +from tvm import relay +import pytest +from tvm.relay.transform import CollagePartition, InferType +from tvm.target import make_compilation_config +from tvm.relay.collage import MockEstimator +from unittest.mock import patch +from tvm.relay.dataflow_pattern import is_op, wildcard + + +def cpu_pattern_table(): + def relu_pattern(): + return is_op("nn.relu")(wildcard()) + + def add_pattern(): + return is_op("add")(wildcard(), wildcard()) + + def concatenate_pattern(): + return is_op("concatenate")(wildcard()) + + return [ + ("relu", relu_pattern(), lambda x: True), + ("add", add_pattern(), lambda x: True), + ("concatenate", concatenate_pattern(), lambda x: True), + ] + + +def _mock_get_pattern_table(target): + if target == "test_external_cpu_target": + return cpu_pattern_table() + + +def run_collage(mod, targets, cost_estimator, tvm_max_depth=8, byoc_max_depth=8): + ctxt = { + "relay.collage.tvm_max_max_depth": tvm_max_depth, + "relay.collage.byoc_max_max_depth": byoc_max_depth, + } + pass_ctxt = tvm.transform.PassContext(config=ctxt) + with pass_ctxt: + config = make_compilation_config(pass_ctxt, targets) + mod = InferType()(mod) + mod = CollagePartition(config, cost_estimator)(mod) + return mod + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_single_op_llvm(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + nn.relu(%x) +} +""" + expected_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:11 */) -> Tensor[(10, 10), float32] { + nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 1, + "test_external_cpu_target": 2, + } + ) + mod = run_collage(mod, targets, cost_estimator) 
+ assert mod.astext() == expected_txt + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_single_op_byoc(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + nn.relu(%x) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:11 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator) + assert mod.astext() == expected_txt + + +@pytest.mark.parametrize("byoc_max_depth", [1, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_diamond_valid_topology(mock_get_pattern_table, byoc_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = abs(%0); + %2 = nn.relu(%1); + add(%1, %2) +} +""" + expected_3_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* 
ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:4:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu_add(%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_add") -> Tensor[(10, 10), float32] { + %1 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2 = %1(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] */; + %3 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_03, %FunctionVar_1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3(%FunctionVar_02, %2) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target_nn_relu_add(%5) /* 
ty=Tensor[(10, 10), float32] */ +} +""" + expected_1_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_01, %FunctionVar_1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%FunctionVar_0, %1) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:4:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), 
float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target(%5) /* ty=Tensor[(10, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=byoc_max_depth) + + expected_mod = tvm.parser.fromtext(expected_1_txt if byoc_max_depth == 1 else expected_3_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@pytest.mark.parametrize("tvm_max_depth", [1, 2, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_tvm_max_depth(mock_get_pattern_table, tvm_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + nn.relu(%1) +} +""" + expected_txts = { + 1: """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), 
float32] */; + %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %4(%3) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target(%x) /* ty=Tensor[(10, 10), float32] */ +} +""", + 2: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %1 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %2 = nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%2) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + 3: """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %0 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + %1 = nn.relu(%0) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + } + + 
mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 100, + "test_external_cpu_target": 99, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=tvm_max_depth, byoc_max_depth=1) + + expected_mod = tvm.parser.fromtext(expected_txts[tvm_max_depth]) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@pytest.mark.parametrize("byoc_max_depth", [1, 2, 3]) +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_byoc_max_depth(mock_get_pattern_table, byoc_max_depth): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + nn.relu(%1) +} +""" + expected_txts = { + 1: """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %0 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + %1 = nn.relu(%0) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */; + nn.relu(%1) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""", + 2: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") 
-> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%1) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %3 = nn.relu(%x) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */; + @collage_test_external_cpu_target_nn_relu_nn_relu(%3) /* ty=Tensor[(10, 10), float32] */ +} +""", + 3: """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:11 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + %4 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %4(%3) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] 
span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */ +} +""", + } + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 99, + "test_external_cpu_target": 100, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=byoc_max_depth) + + expected_mod = tvm.parser.fromtext(expected_txts[byoc_max_depth]) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_output_tuple(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = abs(%1); + (%0, %1, %2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:6:4 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:6:8 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + (%1, %3) /* ty=(Tensor[(10, 10), 
float32], Tensor[(10, 10), float32]) */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %4 = @collage_test_external_cpu_target(%x) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */; + %5 = %4.1 /* ty=Tensor[(10, 10), float32] */; + %6 = %4.0 /* ty=Tensor[(10, 10), float32] */; + %7 = abs(%5) /* ty=Tensor[(10, 10), float32] span=from_string:6:12 */; + (%6, %5, %7) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) span=from_string:3:3 */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=2, byoc_max_depth=2) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_partition_intermediate_tuple(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = (%0, %1); + concatenate(%2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target") -> (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:5:9 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), 
float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_02) /* ty=Tensor[(10, 10), float32] span=from_string:5:13 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + (%1, %3) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */ +} + +def @collage_test_external_cpu_target_concatenate(%FunctionVar_03: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_concatenate") -> Tensor[(20, 10), float32] { + %4 = fn (%FunctionVar_04: (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */, Composite="concatenate") -> Tensor[(20, 10), float32] { + concatenate(%FunctionVar_04) /* ty=Tensor[(20, 10), float32] span=from_string:3:3 */ + } /* ty=fn ((Tensor[(10, 10), float32], Tensor[(10, 10), float32])) -> Tensor[(20, 10), float32] */; + %4(%FunctionVar_03) /* ty=Tensor[(20, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(20, 10), float32] { + %5 = @collage_test_external_cpu_target(%x) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */; + %6 = %5.0 /* ty=Tensor[(10, 10), float32] */; + %7 = %5.1 /* ty=Tensor[(10, 10), float32] */; + %8 = (%6, %7) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32]) span=from_string:6:15 */; + @collage_test_external_cpu_target_concatenate(%8) /* ty=Tensor[(20, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + 
"llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=3, byoc_max_depth=5) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_fusion_benefit(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = nn.relu(%x); + %1 = nn.relu(%0); + %2 = abs(%x); + %3 = nn.relu(%2); + %4 = add(%1, %3); + %5 = nn.relu(%4); + abs(%5) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:4:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] span=from_string:7:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %3 = fn (%FunctionVar_05: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_05) /* ty=Tensor[(10, 10), float32] span=from_string:7:16 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] 
*/; + %4 = %2(%1) /* ty=Tensor[(10, 10), float32] */; + %5 = %3(%FunctionVar_1) /* ty=Tensor[(10, 10), float32] */; + %6 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_11: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_02, %FunctionVar_11) /* ty=Tensor[(10, 10), float32] span=from_string:8:16 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %7 = %6(%4, %5) /* ty=Tensor[(10, 10), float32] */; + %8 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_01) /* ty=Tensor[(10, 10), float32] span=from_string:9:7 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %8(%7) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:5:12 */) -> Tensor[(10, 10), float32] { + %9 = abs(%x) /* ty=Tensor[(10, 10), float32] span=from_string:6:16 */; + %10 = @collage_test_external_cpu_target_nn_relu_nn_relu_nn_relu_add_nn_relu(%x, %9) /* ty=Tensor[(10, 10), float32] */; + abs(%10) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ +} +""" + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 5, + "test_external_cpu_target": 6, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=1, byoc_max_depth=5) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +@patch("tvm.relay.op.contrib.get_pattern_table", wraps=_mock_get_pattern_table) +def test_double_residual(mock_get_pattern_table): + mod_txt = """#[version = "0.0.5"] +def @main(%x: Tensor[(10, 10), float32]) { + %0 = 
nn.relu(%x); + %1 = abs(%0); + %2 = add(%0, %1); + add(%1, %2) +} +""" + expected_txt = """#[version = "0.0.5"] +def @collage_test_external_cpu_target_add_add(%FunctionVar_0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_add_add") -> Tensor[(10, 10), float32] { + %0 = fn (%FunctionVar_02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_12: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_02, %FunctionVar_12) /* ty=Tensor[(10, 10), float32] span=from_string:6:11 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %1 = %0(%FunctionVar_1, %FunctionVar_0) /* ty=Tensor[(10, 10), float32] */; + %2 = fn (%FunctionVar_01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %FunctionVar_11: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="add") -> Tensor[(10, 10), float32] { + add(%FunctionVar_01, %FunctionVar_11) /* ty=Tensor[(10, 10), float32] span=from_string:3:3 */ + } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + %2(%FunctionVar_0, %1) /* ty=Tensor[(10, 10), float32] */ +} + +def @collage_test_external_cpu_target_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="test_external_cpu_target", global_symbol="collage_test_external_cpu_target_nn_relu") -> Tensor[(10, 10), float32] { + %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Composite="relu") -> Tensor[(10, 10), float32] { + nn.relu(%FunctionVar_04) /* ty=Tensor[(10, 10), float32] span=from_string:5:12 */ + } /* ty=fn (Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */; + 
%3(%FunctionVar_03) /* ty=Tensor[(10, 10), float32] */ +} + +def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] span=from_string:3:16 */) -> Tensor[(10, 10), float32] { + %4 = @collage_test_external_cpu_target_nn_relu(%x) /* ty=Tensor[(10, 10), float32] */; + %5 = abs(%4) /* ty=Tensor[(10, 10), float32] span=from_string:6:7 */; + @collage_test_external_cpu_target_add_add(%5, %4) /* ty=Tensor[(10, 10), float32] */ +} +""" + + mod = tvm.parser.fromtext(mod_txt) + targets = [ + tvm.target.Target("llvm"), + tvm.target.Target("test_external_cpu_target"), + ] + cost_estimator = MockEstimator( + { + "llvm": 2, + "test_external_cpu_target": 1, + } + ) + mod = run_collage(mod, targets, cost_estimator, tvm_max_depth=4, byoc_max_depth=4) + + expected_mod = tvm.parser.fromtext(expected_txt) + tvm.ir.assert_structural_equal(mod, expected_mod, map_free_vars=True) + + +if __name__ == "__main__": + pytest.main([__file__])