[AutoScheduler] Accelerate feature extraction for winograd (#6981)
* [AutoScheduler] Accelerate feature extraction for winograd

* fix an overflow in feature.cc

* address comments

* address comments

* Update include/tvm/te/schedule.h

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Use a smaller min_repeat_ms

* Use a smaller min_repeat_ms

Co-authored-by: Cody Yu <comaniac0422@gmail.com>
merrymercy and comaniac authored Nov 28, 2020
1 parent 093629c commit 9a9ec1a
Showing 7 changed files with 40 additions and 12 deletions.
12 changes: 12 additions & 0 deletions include/tvm/te/schedule.h
@@ -378,6 +378,18 @@ class Schedule : public ObjectRef {
* \return A normalized schedule, can be same as current one.
*/
Schedule normalize();

/*!
* \brief Normalize the schedule for feature extraction in the auto-scheduler.
* This is similar to `Schedule::normalize`, but we do aggressive simplification
* of the TE compute with const_matrix=True for faster compilation and feature extraction.
* The resulting schedule may be wrong, but it is good enough for feature extraction
* purposes.
*
* \return A normalized schedule, which can be the same as the current one.
*/
Schedule normalize_for_feature_extraction();

/*!
* \brief access the internal node container
* \return the pointer to the internal node container
2 changes: 1 addition & 1 deletion python/tvm/topi/utils.py
@@ -337,7 +337,7 @@ def select_array(i, j):
)
return now

return te.compute(matrix.shape, select_array, name=name)
return te.compute(matrix.shape, select_array, name=name, attrs={"const_matrix": True})


def get_max_power2_factor(n, max_value=None):
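Below is a minimal sketch (not part of this commit; the 4x4 matrix values are placeholders rather than a real winograd transform matrix) of how the "const_matrix" attribute added above can be attached to a constant-matrix tensor, mirroring the helper in python/tvm/topi/utils.py. Tensors carrying this attribute are the ones the new normalize_for_feature_extraction pass can replace with a constant during feature extraction.

import numpy as np
import tvm
from tvm import te

# Placeholder constant matrix; winograd uses its transform matrices here.
matrix = np.ones((4, 4), dtype="float32")

def select_array(i, j):
    # Build a nested Select expression that returns matrix[i][j] as a constant.
    now = tvm.tir.const(0.0, "float32")
    for ii in range(matrix.shape[0]):
        for jj in range(matrix.shape[1]):
            now = tvm.tir.Select(
                tvm.tir.all(i == ii, j == jj),
                tvm.tir.const(float(matrix[ii][jj]), "float32"),
                now,
            )
    return now

# The "const_matrix" attribute marks this tensor as a compile-time constant matrix.
A = te.compute(matrix.shape, select_array, name="A", attrs={"const_matrix": True})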
2 changes: 1 addition & 1 deletion src/auto_scheduler/compute_dag.cc
@@ -1235,7 +1235,7 @@ State ComputeDAG::InferBound(const State& state) const {
Array<te::Tensor> tensors;
// Replay steps to tvm::Schedule
std::tie(sch, tensors) = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes);
sch = sch.normalize();
sch = sch.normalize_for_feature_extraction();
// Get bound information from TVM schedule
Map<IterVar, Range> bounds = te::InferBound(sch);

8 changes: 4 additions & 4 deletions src/auto_scheduler/feature.cc
@@ -669,7 +669,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
math_op_counter(node->value);
std::vector<float> mem_bytes_list;
std::vector<float> compute_ops_list;
int cur_compute_ops;
double cur_compute_ops;

// Group 1: Computation related features
ExtractComputationFeature(node, math_op_counter);
@@ -768,7 +768,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {

// Extract buffer access related features (group 2)
void ExtractBufferAccessFeature(const BufferStoreNode* node, const MathOpCounter& math_op_counter,
int* cur_compute_ops, std::vector<float>* compute_ops_list,
double* cur_compute_ops, std::vector<float>* compute_ops_list,
std::vector<float>* mem_bytes_list) {
FeatureSet& fea = buffer_features[node->buffer];

@@ -920,7 +920,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
}

// Extract arithmetic intensity related feature (group 3)
void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, int cur_compute_ops,
void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, double cur_compute_ops,
const std::vector<float>& compute_ops_list,
const std::vector<float>& mem_bytes_list) {
FeatureSet& fea = buffer_features[node->buffer];
@@ -1267,7 +1267,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
Array<te::Tensor> tensors;

std::tie(sch, tensors) = task->compute_dag.ApplySteps(state->transform_steps);
sch = sch.normalize();
sch = sch.normalize_for_feature_extraction();
auto bounds = te::InferBound(sch);

try {
22 changes: 19 additions & 3 deletions src/te/schedule/schedule_dataflow_rewrite.cc
@@ -502,7 +502,7 @@ void RebaseNonZeroMinLoop(ScheduleNode* sch) {
}
}

void InjectInline(ScheduleNode* sch) {
void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) {
sch->InvalidateCache();

std::vector<Array<PrimExpr> > new_body(sch->stages.size());
@@ -524,7 +524,15 @@ void InjectInline(ScheduleNode* sch) {
args.push_back(iv->var);
}
ICHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output";
body = compute->body[0];

if (feature_extraction_mode && compute->attrs.count("const_matrix")) {
// Use a constant value to replace accesses of const matrices.
// This produces wrong IR but is good enough for feature extraction purposes.
// This simplification can accelerate feature extraction and the evolutionary search.
body = make_const(compute->output_dtype(0), 1.0f);
} else {
body = compute->body[0];
}
}
for (size_t j = i; j < sch->stages.size(); ++j) {
Stage s = sch->stages[j];
@@ -700,7 +708,15 @@ void LegalizeInvalidAttach(ScheduleNode* sch) {

Schedule Schedule::normalize() {
Schedule sn = copy();
InjectInline(sn.operator->());
InjectInline(sn.operator->(), false);
RebaseNonZeroMinLoop(sn.operator->());
LegalizeInvalidAttach(sn.operator->());
return sn;
}

Schedule Schedule::normalize_for_feature_extraction() {
Schedule sn = copy();
InjectInline(sn.operator->(), true);
RebaseNonZeroMinLoop(sn.operator->());
LegalizeInvalidAttach(sn.operator->());
return sn;
2 changes: 1 addition & 1 deletion tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
@@ -85,7 +85,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
# during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
# This can warm up the GPU, which is necessary to get accurate measurement results.
# Typically, we recommend a value > 300 ms.
# Typically, we recommend a value >= 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
# good value for the search to converge. You can do more trials according to your time budget.
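For reference, a minimal sketch (values illustrative, not part of this commit) of measurement options consistent with the updated min_repeat_ms recommendation in this tutorial:

from tvm import auto_scheduler

# Start an RPC measurement context; min_repeat_ms >= 300 warms up the GPU.
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,  # use around 1000 for the search to converge
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
)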
4 changes: 2 additions & 2 deletions tutorials/auto_scheduler/tune_network_cuda.py
@@ -167,7 +167,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
# during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
# This can warm up the GPU, which is necessary to get accurate measurement results.
# Typically, we recommend a value > 300 ms.
# Typically, we recommend a value >= 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
# You can set it to a small number (e.g., 200) for a fast demonstrative run.
# In practice, we recommend setting it around :code:`1000 * len(tasks)`,
@@ -184,7 +184,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):

def run_tuning():
print("Begin tuning...")
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
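For network-level tuning, a minimal sketch (assuming tasks and task_weights were obtained via auto_scheduler.extract_tasks and measure_ctx is the context created above, as in this tutorial) of how the measurement options feed into the task scheduler:

from tvm import auto_scheduler

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=200,  # roughly 1000 * len(tasks) is recommended in practice
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile("network.json")],
)
tuner.tune(tune_option)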
