From 05f3aba8e466dc62ecbf66525314462d6adf98d0 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 21 Nov 2021 23:54:27 -0800 Subject: [PATCH] Squashed commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Meta Schedule][M3c] Schedule Rules, Mutator & Postprocs (#485) [Meta Schedule][M3c] PostOrderApply (#486) Fix Post Order Apply (#490) [MetaSchedule] Relay Integration (#489) [M3c][Meta Schedule] Add Trace Correctness Test for PostOrderApply (#492) Fix replay trace. (#493) [M3c][Meta Schedule] Implement the Replay Func class. (#495) [PR] Test script for meta-schedule task extraction. Interface to load… (#494) [Meta Schedule Refactor] Get child blocks (#500) Read-at && Write-at (#497) [M3c][Meta Schedule] Measure Callbacks (#498) [Bug] Fix Infinite Loop Caused When Calling Methods Not Overrided In PyClass (#496) [MetaSchedule] Sample-Perfect-Tile (#501) [MetaSchedule] TE Workloads (#502) [TensorIR] GetProducer, GetConsumer (#506) [MetaScheduleRefactor] Annotate&Unannotate (#505) [MetaSchedule] Multi-Level-Tiling & Auto-Inline (#503) [Tests] Add unittests for auto-inline and multi-level-tiling (#508) [Meta Schedule] Minor Fixes (#507) [MetaSchedule] Rewrite Cooperative-Fetching / Unbound-Block / Reduction-Block (#509) [MetaSchedule] Rewrite Parallel-Vectorize-Unroll / Verify-GPU / Disallow-Dynamic-Loops (#499) [Meta Schedule] Add Helper Function & Minor Modification (#512) [MetaSchedule] Test for Rewrite Parallel-Vectorize-Unroll (#513) [Meta Schedule] Feature Extractor & Cost Model (#510) Blockize & Tensorize (#514) Layout Rewriting: Suggest-Index-Map (#520) [MetaSchedule] Parallel-Vectorize-Unroll & Random-Compute-Location (#516) [Meta Schedule] Per-Store-Feature (#521) Add traced schedule for blockize & tensorize (#526) [Meta Schedule] Add XGBoost Model & Random Model (#519) User-Interface: Tune-TIR (#525) User-Interface: Tune-TE (#527) [Minor] More logging on python (#528) Get CUDA tuning working (#529) [MetaSchedule] 
TensorRT BYOC (#518) [BugFix] LocalBuilder API (#531) [Meta Schedule] Add Cost Model Update Measure Callback (#530) [Bugfix] BuilderInput with default params (#532) [MetaSchedule] Mutator-Tile-Size, Mutate-Parallel, Mutate-Unroll (#534) [Meta Schedule] Evolutionary Search (#522) [BugFix] Remove duplicated definition of MakeMultinomialSampler (#535) [Meta Schedule] Fix some bugs (#537) Initiate Experiments for CPU Performance Alignment with Ansor (#538) [Meta Schedule] Tweak experiment scripts (#539) [Meta Schedule] Initiate experiments on CUDA (#540) [TIR][Schedule] Buffer transform (#523) Auto Tensor Core (#524) Working on Evo Search (#542) [Meta Schedule] Add Replay Tuning Interface (#543) Evolutionary Search on CPU (#544) Misc improvement over the error message (#545) [TIR][Schedule] Software pipelining (#533) [Meta Schedule Refactor] fixing unit tests (#547) [MetaSchedule] Mutator-Compute-Location (#548) Misc Improvement of Evolutionary Search (#549) Hotfix for software pipeline (#552) Misc Improvement (#550) Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Ruihang Lai Co-authored-by: Junru Shao Co-authored-by: Wuwei Lin Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> Co-authored-by: Xiyou Zhou Squashed commit [Meta Schedule][M3c] Schedule Rules, Mutator & Postprocs (#485) [Meta Schedule][M3c] PostOrderApply (#486) Fix Post Order Apply (#490) [MetaSchedule] Relay Integration (#489) [M3c][Meta Schedule] Add Trace Correctness Test for PostOrderApply (#492) Fix replay trace. (#493) [M3c][Meta Schedule] Implement the Replay Func class. (#495) [PR] Test script for meta-schedule task extraction. 
Interface to load… (#494) [Meta Schedule Refactor] Get child blocks (#500) Read-at && Write-at (#497) [M3c][Meta Schedule] Measure Callbacks (#498) [Bug] Fix Infinite Loop Caused When Calling Methods Not Overrided In PyClass (#496) [MetaSchedule] Sample-Perfect-Tile (#501) [MetaSchedule] TE Workloads (#502) [TensorIR] GetProducer, GetConsumer (#506) [MetaScheduleRefactor] Annotate&Unannotate (#505) [MetaSchedule] Multi-Level-Tiling & Auto-Inline (#503) [Tests] Add unittests for auto-inline and multi-level-tiling (#508) [Meta Schedule] Minor Fixes (#507) [MetaSchedule] Rewrite Cooperative-Fetching / Unbound-Block / Reduction-Block (#509) [MetaSchedule] Rewrite Parallel-Vectorize-Unroll / Verify-GPU / Disallow-Dynamic-Loops (#499) [Meta Schedule] Add Helper Function & Minor Modification (#512) [MetaSchedule] Test for Rewrite Parallel-Vectorize-Unroll (#513) [Meta Schedule] Feature Extractor & Cost Model (#510) Blockize & Tensorize (#514) Layout Rewriting: Suggest-Index-Map (#520) [MetaSchedule] Parallel-Vectorize-Unroll & Random-Compute-Location (#516) [Meta Schedule] Per-Store-Feature (#521) Add traced schedule for blockize & tensorize (#526) [Meta Schedule] Add XGBoost Model & Random Model (#519) User-Interface: Tune-TIR (#525) User-Interface: Tune-TE (#527) [Minor] More logging on python (#528) Get CUDA tuning working (#529) [MetaSchedule] TensorRT BYOC (#518) [BugFix] LocalBuilder API (#531) [Meta Schedule] Add Cost Model Update Measure Callback (#530) [Bugfix] BuilderInput with default params (#532) [MetaSchedule] Mutator-Tile-Size, Mutate-Parallel, Mutate-Unroll (#534) [Meta Schedule] Evolutionary Search (#522) [BugFix] Remove duplicated definition of MakeMultinomialSampler (#535) [Meta Schedule] Fix some bugs (#537) Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Ruihang Lai Co-authored-by: Junru Shao Co-authored-by: Wuwei Lin 
Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> Co-authored-by: Xiyou Zhou Initiate Experiments for CPU Performance Alignment with Ansor (#538) * ... * update * update * print * more [Meta Schedule] Tweak experiment scripts (#539) [Meta Schedule] Initiate experiments on CUDA (#540) * [Meta Schedule] Initiate experiments on CUDA * ... * fix boolean printing Auto Tensor Core (#524) Working on Evo Search (#542) Squashed commit [Meta Schedule][M3c] Schedule Rules, Mutator & Postprocs (#485) [Meta Schedule][M3c] PostOrderApply (#486) Fix Post Order Apply (#490) [MetaSchedule] Relay Integration (#489) [M3c][Meta Schedule] Add Trace Correctness Test for PostOrderApply (#492) Fix replay trace. (#493) [M3c][Meta Schedule] Implement the Replay Func class. (#495) [PR] Test script for meta-schedule task extraction. Interface to load… (#494) [Meta Schedule Refactor] Get child blocks (#500) Read-at && Write-at (#497) [M3c][Meta Schedule] Measure Callbacks (#498) [Bug] Fix Infinite Loop Caused When Calling Methods Not Overrided In PyClass (#496) [MetaSchedule] Sample-Perfect-Tile (#501) [MetaSchedule] TE Workloads (#502) [TensorIR] GetProducer, GetConsumer (#506) [MetaScheduleRefactor] Annotate&Unannotate (#505) [MetaSchedule] Multi-Level-Tiling & Auto-Inline (#503) [Tests] Add unittests for auto-inline and multi-level-tiling (#508) [Meta Schedule] Minor Fixes (#507) [MetaSchedule] Rewrite Cooperative-Fetching / Unbound-Block / Reduction-Block (#509) [MetaSchedule] Rewrite Parallel-Vectorize-Unroll / Verify-GPU / Disallow-Dynamic-Loops (#499) [Meta Schedule] Add Helper Function & Minor Modification (#512) [MetaSchedule] Test for Rewrite Parallel-Vectorize-Unroll (#513) [Meta Schedule] Feature Extractor & Cost Model (#510) Blockize & Tensorize (#514) Layout Rewriting: Suggest-Index-Map (#520) Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> 
Co-authored-by: Ruihang Lai Co-authored-by: Junru Shao Co-authored-by: Wuwei Lin Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> [MetaSchedule] Parallel-Vectorize-Unroll & Random-Compute-Location (#516) * parallel vectorize unroll & random compute location * rebased [Meta Schedule] Per-Store-Feature (#521) [Meta Schedule] Add XGBoost Model & Random Model (#519) * Squashed commit [Meta Schedule][M3c] Schedule Rules, Mutator & Postprocs (#485) [Meta Schedule][M3c] PostOrderApply (#486) Fix Post Order Apply (#490) [MetaSchedule] Relay Integration (#489) [M3c][Meta Schedule] Add Trace Correctness Test for PostOrderApply (#492) Fix replay trace. (#493) [M3c][Meta Schedule] Implement the Replay Func class. (#495) [PR] Test script for meta-schedule task extraction. Interface to load… (#494) [Meta Schedule Refactor] Get child blocks (#500) Read-at && Write-at (#497) [M3c][Meta Schedule] Measure Callbacks (#498) [Bug] Fix Infinite Loop Caused When Calling Methods Not Overrided In PyClass (#496) [MetaSchedule] Sample-Perfect-Tile (#501) [MetaSchedule] TE Workloads (#502) Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Ruihang Lai Co-authored-by: Junru Shao Co-authored-by: Wuwei Lin Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> [TensorIR] GetProducer, GetConsumer (#506) [MetaScheduleRefactor] Annotate&Unannotate (#505) * annotate * annotate * lint * test * fix * fix * fix [MetaSchedule] Rewrite Cooperative-Fetching / Unbound-Block / Reduction-Block (#509) Fix sttr func & schedule naming. Fix schedule -> sch. Add feature extractor. Fix init. Add cost model. Remove unused include. 
[MetaSchedule] Rewrite Parallel-Vectorize-Unroll / Verify-GPU / Disallow-Dynamic-Loops (#499) * wip fix * revoke change to gallery * split postprocessors to separate files * rename attrs * minor * minor tweak on utils.h * refactor disallow-dynamic-loop * refactor verify_gpu_code * successfully give up refactoring parallelize-vectorize-unroll * python structuring * unittests Co-authored-by: Junru Shao Fix issues. Fix init. Finish random model part. Finish xgb model. Minor fix. Rebase. Add init. Await refactor of callback. Update a bit on the test case. Move impos. Minor fix. More fixes. Remove unused import. Fix per store feature test. Update model save / load. * Fix model save / load with tar. * Fix issues. * Remove dup. Co-authored-by: Junru Shao User-Interface: Tune-TIR (#525) * User-Interface: Tune-TIR * fix fix fix User-Interface: Tune-TE (#527) * fix a lot of issues * Add tune-te Get CUDA tuning working (#529) [Meta Schedule] Evolutionary Search (#522) * Checkpoint. Fix cost model comment. Finish evolutionary search. Remove extra code. Fix compile. Add comments. Add python part. Add test. Update other files & comments. * Squashed commit [Meta Schedule][M3c] Schedule Rules, Mutator & Postprocs (#485) [Meta Schedule][M3c] PostOrderApply (#486) Fix Post Order Apply (#490) [MetaSchedule] Relay Integration (#489) [M3c][Meta Schedule] Add Trace Correctness Test for PostOrderApply (#492) Fix replay trace. (#493) [M3c][Meta Schedule] Implement the Replay Func class. (#495) [PR] Test script for meta-schedule task extraction. 
Interface to load… (#494) [Meta Schedule Refactor] Get child blocks (#500) Read-at && Write-at (#497) [M3c][Meta Schedule] Measure Callbacks (#498) [Bug] Fix Infinite Loop Caused When Calling Methods Not Overrided In PyClass (#496) [MetaSchedule] Sample-Perfect-Tile (#501) [MetaSchedule] TE Workloads (#502) Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Ruihang Lai Co-authored-by: Junru Shao Co-authored-by: Wuwei Lin Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> * [TensorIR] GetProducer, GetConsumer (#506) * [MetaScheduleRefactor] Annotate&Unannotate (#505) * annotate * annotate * lint * test * fix * fix * fix * [MetaSchedule] Rewrite Cooperative-Fetching / Unbound-Block / Reduction-Block (#509) * Blockize & Tensorize (#514) * Blockize & Tensorize * Update tensor intrin * Fix blockized & Recalculate affine flags * Cleanup utils.cc * Add test cases of blockize * Re-enable affine flag checking * Checkpoint. Fix cost model comment. Finish evolutionary search. Remove extra code. Fix compile. Add comments. Add python part. Add test. Update other files & comments. Fix random seed bug. Minor fix. Fix num-cores. Add docs. Check point. Add max_fail_cnt. Minor fix. Minor fix. Segfault. Fix pointers to trace. Test fix. Remove measure callbacks. Refactor a bit. Split function. Adjust variable name. Minor fixes. Add mutator probs to TuneContext. Add token. Fix loops. Remove include. Add has workload for database. Add check. Add concurrent bitmask. * Fix TuneContext. * Fix hash & stuff. * Modify shash. * Remove trace field. * Minor fix. * Fix cbmask. * Fix numbers. 
Co-authored-by: Junru Shao Co-authored-by: Siyuan Feng Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Ruihang Lai Co-authored-by: Wuwei Lin Co-authored-by: Sunghyun Park <49998730+sunggg@users.noreply.github.com> [BugFix] Remove duplicated definition of MakeMultinomialSampler (#535) Tune relay. Further add interface. Remove unused import Fix rebase. Add task name dispatch. Add task deduplication. Rename extract_task to extract_task_from_relay Remove duplicate function def. Minor fix. --- python/tvm/meta_schedule/integration.py | 2 +- python/tvm/meta_schedule/tune.py | 30 +++++++++++++++---- src/tir/schedule/primitive.h | 9 ------ .../test_meta_schedule_integration.py | 2 +- .../test_meta_schedule_task_extraction.py | 2 +- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/python/tvm/meta_schedule/integration.py b/python/tvm/meta_schedule/integration.py index 47003c6faa25..5cd483698a13 100644 --- a/python/tvm/meta_schedule/integration.py +++ b/python/tvm/meta_schedule/integration.py @@ -177,7 +177,7 @@ class ApplyHistoryBest(MetaScheduleContext): pass -def extract_task( +def extract_task_from_relay( mod: Union[IRModule, RelayFunc], target: Target, params: Optional[Dict[str, NDArray]] = None, diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 21d7a2614261..bcfa08cdfc7d 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -19,10 +19,11 @@ import logging import os.path from typing import Callable, Dict, List, Optional, Union +from tvm.ir.base import structural_equal, structural_hash from tvm.ir.module import IRModule from tvm.runtime import NDArray -from tvm.meta_schedule.integration import extract_task +from tvm.meta_schedule.integration import extract_task_from_relay from tvm.target.target import Target from tvm.te import Tensor, create_prim_func from tvm.tir import PrimFunc, Schedule @@ 
-650,11 +651,12 @@ def tune_relay( """ logger.info("Working directory: %s", work_dir) - extracted_tasks = extract_task(mod, target, params) + extracted_tasks = extract_task_from_relay(mod, target, params) # pylint: disable=protected-access tune_contexts = [] target = Parse._target(target) database = Parse._database(database, task_name, work_dir) + # parse the tuning contexts for task in extracted_tasks: assert len(task.dispatched) == 1, "Only size 1 dispatched task list is supported for now" mod = Parse._mod(task.dispatched[0]) @@ -664,7 +666,7 @@ def tune_relay( mod=mod, target=target, config=config, - task_name=task_name, + task_name=task.task_name, space_generator=space, sch_rules=sch_rules, postprocs=postprocs, @@ -672,9 +674,27 @@ def tune_relay( num_threads=num_threads, ) ) + # deduplication + logger.info(f"Before task deduplication: {len(tune_contexts)} tasks") + tasks: List[TuneContext] = [] + hashs: List[int] = [] + for i, task in enumerate(tune_contexts): + struct_hash: int = structural_hash(task.mod) + flag: bool = False + if struct_hash in hashs: + for other_task in tune_contexts[i + 1 :]: + if structural_equal(task.mod, other_task.mod): + flag = True + break + if not flag: + tasks.append(task) + hashs.append(struct_hash) + logger.info(f"After task deduplication: {len(tasks)} tasks") + + # parse the task scheduler task_scheduler = Parse._task_scheduler( task_scheduler, - tune_contexts, + tasks, builder=Parse._builder(builder), runner=Parse._runner(runner), database=database, @@ -684,7 +704,7 @@ def tune_relay( # pylint: enable=protected-access task_scheduler.tune() schs: List[Schedule] = [] - for task in tune_contexts: + for task in tasks: mod = task.mod workload = database.commit_workload(mod) bests: List[TuningRecord] = database.get_top_k(workload, top_k=1) diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 6ca748e28573..545290a3de23 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -28,15 
+28,6 @@ namespace tvm { namespace tir { -/*! - * \brief Create a sampling function that does multinomial sampling. - * \param rand_state The random state. - * \param weights The weights for multinomial sampling. - * \return The multinomial sampling function. - */ -TVM_DLL std::function MakeMultinomialSampler( - support::LinearCongruentialEngine::TRandState* rand_state, const std::vector& weights); - /******** Schedule: Sampling ********/ /*! * \brief Sample a random integer from a given range. diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py index f508c7d252e1..0ace4d2bd02c 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -112,7 +112,7 @@ def test_meta_schedule_integration_extract_from_resnet(): layout="NHWC", dtype="float32", ) - extracted_tasks = ms.integration.extract_task(mod, target="llvm", params=params) + extracted_tasks = ms.integration.extract_task_from_relay(mod, target="llvm", params=params) assert len(extracted_tasks) == 30 diff --git a/tests/python/unittest/test_meta_schedule_task_extraction.py b/tests/python/unittest/test_meta_schedule_task_extraction.py index 8d1eca51432e..8523275f5186 100644 --- a/tests/python/unittest/test_meta_schedule_task_extraction.py +++ b/tests/python/unittest/test_meta_schedule_task_extraction.py @@ -91,7 +91,7 @@ def test_meta_schedule_extract_from_torch_model(model_name: str, batch_size: int dtype="float32", ) target = tvm.target.Target(target) - ms.integration.extract_task(mod, params=params, target=target) + ms.integration.extract_task_from_relay(mod, params=params, target=target) if __name__ == "__main__":