Skip to content

Commit

Permalink
[MetaSchedule] Modify Profiler Timers (#11735)
Browse files Browse the repository at this point in the history
Minor modifications to the scoped timers so that they cover 99% of the total time cost during MetaSchedule tuning. Also allow the time spent in `ApplyHistoryBest` and `TaskExtraction` to be counted during `tune_relay`.
  • Loading branch information
zxybazh authored Jun 15, 2022
1 parent d0cbde0 commit 6ce41be
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 79 deletions.
17 changes: 10 additions & 7 deletions python/tvm/meta_schedule/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .measure_callback import MeasureCallback
from .mutator import Mutator
from .postproc import Postproc
from .profiler import Profiler
from .runner import Runner
from .schedule_rule import ScheduleRule
from .search_strategy import EvolutionarySearch, ReplayFunc, ReplayTrace
Expand Down Expand Up @@ -563,7 +564,8 @@ def tune_relay(
target = default_config.target(target)
# pylint: enable=protected-access,
# parse the tuning contexts
extracted_tasks = extract_task_from_relay(mod, target, params)
with Profiler.timeit("TaskExtraction"):
extracted_tasks = extract_task_from_relay(mod, target, params)
database = tune_extracted_tasks(
extracted_tasks,
config,
Expand All @@ -579,9 +581,10 @@ def tune_relay(
mutator_probs=mutator_probs,
num_threads=num_threads,
)
with target, autotvm_silencer(), ApplyHistoryBest(database):
with PassContext(
opt_level=3,
config={"relay.backend.use_meta_schedule": True},
):
return relay_build(mod, target=target, params=params)
with Profiler.timeit("ApplyHistoryBest"):
with target, autotvm_silencer(), ApplyHistoryBest(database):
with PassContext(
opt_level=3,
config={"relay.backend.use_meta_schedule": True},
):
return relay_build(mod, target=target, params=params)
2 changes: 1 addition & 1 deletion src/meta_schedule/measure_callback/add_to_database.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class AddToDatabaseNode : public MeasureCallbackNode {
if (!task_scheduler->database.defined()) {
return;
}
auto _ = Profiler::TimedScope("AddToDatabase");
auto _ = Profiler::TimedScope("MeasureCallback/AddToDatabase");
TuneContext task = task_scheduler->tasks[task_id];
Database database = task_scheduler->database.value();
Workload workload = database->CommitWorkload(task->mod.value());
Expand Down
2 changes: 1 addition & 1 deletion src/meta_schedule/measure_callback/echo_statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ class EchoStatisticsNode : public MeasureCallbackNode {
const Array<MeasureCandidate>& measure_candidates,
const Array<BuilderResult>& builder_results,
const Array<RunnerResult>& runner_results) final {
auto _ = Profiler::TimedScope("MeasureCallback/EchoStatistics");
if (this->task_info.empty()) {
SetupTaskInfo(task_scheduler->tasks);
}
auto _ = Profiler::TimedScope("EchoStatistics");
ICHECK_EQ(measure_candidates.size(), builder_results.size());
ICHECK_EQ(measure_candidates.size(), runner_results.size());
int n = measure_candidates.size();
Expand Down
2 changes: 1 addition & 1 deletion src/meta_schedule/measure_callback/measure_callback.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void PyMeasureCallbackNode::Apply(const TaskScheduler& task_scheduler,
const Array<BuilderResult>& builds, //
const Array<RunnerResult>& results) {
ICHECK(f_apply != nullptr) << "PyMeasureCallback's Apply method not implemented!";
auto _ = Profiler::TimedScope(this->f_as_string());
auto _ = Profiler::TimedScope("MeasureCallback/" + this->f_as_string());
return f_apply(task_scheduler, task_id, measure_candidates, builds, results);
}

Expand Down
3 changes: 2 additions & 1 deletion src/meta_schedule/measure_callback/remove_build_artifact.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ class RemoveBuildArtifactNode : public MeasureCallbackNode {
const Array<BuilderResult>& builder_results,
const Array<RunnerResult>& runner_results) final {
static const PackedFunc* f_rm = runtime::Registry::Get("meta_schedule.remove_build_dir");
auto _ = Profiler::TimedScope("RemoveBuildArtifact");
ICHECK(*f_rm != nullptr) << "The `remove_build_dir` func is not in tvm registry.";
auto _ = Profiler::TimedScope("MeasureCallback/RemoveBuildArtifact");
for (const BuilderResult& build_result : builder_results) {
if (Optional<String> path = build_result->artifact_path) {
(*f_rm)(path.value());
Expand Down
2 changes: 1 addition & 1 deletion src/meta_schedule/measure_callback/update_cost_model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class UpdateCostModelNode : public MeasureCallbackNode {
const Array<MeasureCandidate>& measure_candidates,
const Array<BuilderResult>& builder_results,
const Array<RunnerResult>& runner_results) final {
auto _ = Profiler::TimedScope("UpdateCostModel");
auto _ = Profiler::TimedScope("MeasureCallback/UpdateCostModel");
TuneContext task = task_scheduler->tasks[task_id];
ICHECK(task_scheduler->cost_model.defined())
<< "Cost model must be defined for the task scheduler!";
Expand Down
4 changes: 3 additions & 1 deletion src/meta_schedule/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ String ProfilerNode::Table() const {
p.Row() << i << table_entry[i].name << table_entry[i].minutes << table_entry[i].percentage;
}
}
p.Separator();
return p.AsStr();
}

Expand All @@ -79,7 +80,8 @@ PackedFunc ProfilerTimedScope(String name) {
tik = std::chrono::high_resolution_clock::now(), //
name = std::move(name)]() {
auto tok = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration_cast<std::chrono::seconds>(tok - tik).count();
double duration =
std::chrono::duration_cast<std::chrono::nanoseconds>(tok - tik).count() / 1e9;
profiler->stats_sec[name] += duration;
});
}
Expand Down
108 changes: 61 additions & 47 deletions src/meta_schedule/search_strategy/evolutionary_search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -499,41 +499,49 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu

std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
std::vector<Schedule> population, int num) {
ICHECK_GT(num, 0);
// The heap to record best schedule, we do not consider schedules that are already measured
IRModuleSet exists = this->measured_workloads_;
IRModuleSet exists;
{
auto _ = Profiler::TimedScope("EvoSearch/Evolve/Misc/CopyMeasuredWorkloads");
ICHECK_GT(num, 0);
// The heap to record best schedule, we do not consider schedules that are already measured
exists = this->measured_workloads_;
}
SizedHeap heap(num);
for (int iter = 0;; ++iter) {
// Predict normalized score with the cost model,
std::vector<double> scores = PredictNormalizedScore(population, //
GetRef<TuneContext>(self->context_), //
this->cost_model_, //
this->args_info_);
ICHECK_EQ(scores.size(), population.size());
for (int i = 0, n = population.size(); i < n; ++i) {
Schedule sch = population.at(i);
IRModule mod = sch->mod();
size_t shash = StructuralHash()(mod);
double score = scores.at(i);
if (!exists.Has(mod, shash)) {
exists.Add(mod, shash);
heap.Push(sch, score);

{
auto _ = Profiler::TimedScope("EvoSearch/Evolve/Misc");
ICHECK_EQ(scores.size(), population.size());
for (int i = 0, n = population.size(); i < n; ++i) {
Schedule sch = population.at(i);
IRModule mod = sch->mod();
size_t shash = StructuralHash()(mod);
double score = scores.at(i);
if (!exists.Has(mod, shash)) {
exists.Add(mod, shash);
heap.Push(sch, score);
}
}
// Discontinue once it reaches end of search
if (iter == self->genetic_num_iters) {
break;
}
// Set threaded samplers, with probability from predicated normalized throughput
for (PerThreadData& data : this->per_thread_data_) {
data.Set(scores, self->genetic_mutate_prob, self->context_->mutator_probs);
}
}
// Discontinue once it reaches end of search
if (iter == self->genetic_num_iters) {
break;
}
// Set threaded samplers, with probability from predicated normalized throughput
for (PerThreadData& data : this->per_thread_data_) {
data.Set(scores, self->genetic_mutate_prob, self->context_->mutator_probs);
}
ThreadedTraceApply pp(self->context_->postprocs);
ConcurrentBitmask cbmask(self->population_size);
std::vector<Schedule> next_population(self->population_size, Schedule{nullptr});
// The worker function
{
auto _ = Profiler::TimedScope("EvoSearch/Evolve/Mutation");
ThreadedTraceApply pp(self->context_->postprocs);
ConcurrentBitmask cbmask(self->population_size);
std::vector<Schedule> next_population(self->population_size, Schedule{nullptr});
// The worker function
auto f_find_candidate = [&cbmask, &population, &next_population, &pp, this](int thread_id,
int trace_id) {
// Prepare samplers
Expand Down Expand Up @@ -571,40 +579,46 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
};
support::parallel_for_dynamic(0, self->population_size, self->context_->num_threads,
f_find_candidate);

population.swap(next_population);
TVM_PY_LOG(INFO, self->context_->logging_func)
<< "Evolve iter #" << iter << " done. Summary:\n"
<< pp.SummarizeFailures();
}
population.swap(next_population);
TVM_PY_LOG(INFO, self->context_->logging_func) << "Evolve iter #" << iter << " done. Summary:\n"
<< pp.SummarizeFailures();
}
// Return the best states from the heap, sorting from higher score to lower ones
std::sort(heap.heap.begin(), heap.heap.end());
std::vector<Schedule> results;
results.reserve(num);
for (const SizedHeap::Item& item : heap.heap) {
results.push_back(item.sch);
}
{
auto _ = Profiler::TimedScope("EvoSearch/Evolve/Misc");
std::sort(heap.heap.begin(), heap.heap.end());
std::vector<Schedule> results;
results.reserve(num);
for (const SizedHeap::Item& item : heap.heap) {
results.push_back(item.sch);
}

constexpr int kNumScoresPerLine = 16;
std::ostringstream os;
int n = heap.heap.size();
for (int st = 0; st < n; st += kNumScoresPerLine) {
os << std::endl;
int ed = std::min(st + kNumScoresPerLine, n);
os << "[" << (st + 1) << " : " << ed << "]:\t";
for (int i = st; i < ed; ++i) {
if (i != st) {
os << " ";
constexpr int kNumScoresPerLine = 16;
std::ostringstream os;
int n = heap.heap.size();
for (int st = 0; st < n; st += kNumScoresPerLine) {
os << std::endl;
int ed = std::min(st + kNumScoresPerLine, n);
os << "[" << (st + 1) << " : " << ed << "]:\t";
for (int i = st; i < ed; ++i) {
if (i != st) {
os << " ";
}
os << std::fixed << std::setprecision(4) << heap.heap.at(i).score;
}
os << std::fixed << std::setprecision(4) << heap.heap.at(i).score;
}
TVM_PY_LOG(INFO, self->context_->logging_func)
<< "Scores of the best " << n << " candidates:" << os.str();
return results;
}
TVM_PY_LOG(INFO, self->context_->logging_func)
<< "Scores of the best " << n << " candidates:" << os.str();
return results;
}

std::vector<Schedule> EvolutionarySearchNode::State::PickWithEpsGreedy(
const std::vector<Schedule>& unmeasured, const std::vector<Schedule>& bests, int num) {
auto _ = Profiler::TimedScope("EvoSearch/PickWithEpsGreedy");
int num_rands = num * self->eps_greedy;
int num_bests = num - num_rands;
std::vector<int> rands =
Expand Down
1 change: 1 addition & 0 deletions src/meta_schedule/task_scheduler/task_scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ namespace tvm {
namespace meta_schedule {

void TaskSchedulerNode::InitializeTask(int task_id) {
auto _ = Profiler::TimedScope("InitializeTask");
TuneContext task = this->tasks[task_id];
TVM_PY_LOG(INFO, this->logging_func)
<< "Initializing Task #" << task_id << ": " << task->task_name;
Expand Down
33 changes: 14 additions & 19 deletions tests/python/unittest/test_meta_schedule_tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,25 +138,20 @@ def test_meta_schedule_tune_relay(
mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
target = Target(target)
with tempfile.TemporaryDirectory() as work_dir:
rt_mod1: tvm.runtime.Module = ms.tune_relay(
mod=mod,
params=params,
target=target,
config=ms.TuneConfig(
strategy="evolutionary",
num_trials_per_iter=32,
max_trials_per_task=20000,
max_trials_global=20000,
search_strategy_config={
"genetic_num_iters": 10,
},
),
work_dir=work_dir,
database=ms.database.JSONDatabase(
osp.join(work_dir, "workload.json"),
osp.join(work_dir, "records.json"),
),
)
with ms.Profiler() as profiler:
rt_mod1: tvm.runtime.Module = ms.tune_relay(
mod=mod,
params=params,
target=target,
config=ms.TuneConfig(
strategy="evolutionary",
num_trials_per_iter=32,
max_trials_per_task=20000,
max_trials_global=20000,
),
work_dir=work_dir,
)
print(profiler.table())
# Compile without meta-scheduler for correctness check
with tvm.transform.PassContext(opt_level=0):
rt_mod2 = relay.build(mod, target=target, params=params)
Expand Down

0 comments on commit 6ce41be

Please sign in to comment.